150 files changed, 2211 insertions, 1677 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index fc06fd27065e..dd6f7ee1e312 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -610,6 +610,9 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
                 page, (unsigned long)filp->private_data);
+        /* Update file times before taking page lock */
+        file_update_time(filp);
        v9inode = V9FS_I(inode);
        /* make sure the cache has finished storing the page */
        v9fs_fscache_wait_on_page_write(inode, page);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1feb68ecef95..842d00048a65 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -94,25 +94,21 @@ static struct dentry *get_next_positive_subdir(struct dentry *prev,
 {
        struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
        struct list_head *next;
-        struct dentry *p, *q;
+        struct dentry *q;
        spin_lock(&sbi->lookup_lock);
+        spin_lock(&root->d_lock);
-        if (prev == NULL) {
+        if (prev)
-                spin_lock(&root->d_lock);
+                next = prev->d_u.d_child.next;
+        else {
                prev = dget_dlock(root);
                next = prev->d_subdirs.next;
-                p = prev;
-                goto start;
        }
-        p = prev;
+cont:
-        spin_lock(&p->d_lock);
-again:
-        next = p->d_u.d_child.next;
-start:
        if (next == &root->d_subdirs) {
-                spin_unlock(&p->d_lock);
+                spin_unlock(&root->d_lock);
                spin_unlock(&sbi->lookup_lock);
                dput(prev);
                return NULL;
@@ -121,16 +117,15 @@ start:
        q = list_entry(next, struct dentry, d_u.d_child);
        spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
-        /* Negative dentry - try next */
+        /* Already gone or negative dentry (under construction) - try next */
-        if (!simple_positive(q)) {
+        if (q->d_count == 0 || !simple_positive(q)) {
-                spin_unlock(&p->d_lock);
+                spin_unlock(&q->d_lock);
-                lock_set_subclass(&q->d_lock.dep_map, 0, _RET_IP_);
+                next = q->d_u.d_child.next;
-                p = q;
+                goto cont;
-                goto again;
        }
        dget_dlock(q);
        spin_unlock(&q->d_lock);
-        spin_unlock(&p->d_lock);
+        spin_unlock(&root->d_lock);
        spin_unlock(&sbi->lookup_lock);
        dput(prev);
@@ -404,11 +399,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
                        DPRINTK("checking mountpoint %p %.*s",
                                dentry, (int)dentry->d_name.len, dentry->d_name.name);
-                        /* Path walk currently on this dentry? */
-                        ino_count = atomic_read(&ino->count) + 2;
-                        if (dentry->d_count > ino_count)
-                                goto next;
                        /* Can we umount this guy */
                        if (autofs4_mount_busy(mnt, dentry))
                                goto next;
diff --git a/fs/bio.c b/fs/bio.c
index 73922abba832..71072ab99128 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -73,7 +73,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
 {
        unsigned int sz = sizeof(struct bio) + extra_size;
        struct kmem_cache *slab = NULL;
-        struct bio_slab *bslab;
+        struct bio_slab *bslab, *new_bio_slabs;
        unsigned int i, entry = -1;
        mutex_lock(&bio_slab_lock);
@@ -97,11 +97,12 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
        if (bio_slab_nr == bio_slab_max && entry == -1) {
                bio_slab_max <<= 1;
-                bio_slabs = krealloc(bio_slabs,
+                new_bio_slabs = krealloc(bio_slabs,
-                                     bio_slab_max * sizeof(struct bio_slab),
+                                         bio_slab_max * sizeof(struct bio_slab),
-                                     GFP_KERNEL);
+                                         GFP_KERNEL);
-                if (!bio_slabs)
+                if (!new_bio_slabs)
                        goto out_unlock;
+                bio_slabs = new_bio_slabs;
        }
        if (entry == -1)
                entry = bio_slab_nr++;
@@ -1312,7 +1313,7 @@ EXPORT_SYMBOL(bio_copy_kern);
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
- * But other code (eg, pdflush) could clean the pages if they are mapped
+ * But other code (eg, flusher threads) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e519195d45b..38e721b35d45 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1578,10 +1578,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
                         unsigned long nr_segs, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
+        struct blk_plug plug;
        ssize_t ret;
        BUG_ON(iocb->ki_pos != pos);
+        blk_start_plug(&plug);
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        if (ret > 0 || ret == -EIOCBQUEUED) {
                ssize_t err;
@@ -1590,6 +1592,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
                if (err < 0 && ret > 0)
                        ret = err;
        }
+        blk_finish_plug(&plug);
        return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index a256f3b2a845..ff6475f409d6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1438,10 +1438,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
        ret = extent_from_logical(fs_info, logical, path,
                                        &found_key);
        btrfs_release_path(path);
-        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-                ret = -EINVAL;
        if (ret < 0)
                return ret;
+        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+                return -EINVAL;
        extent_item_pos = logical - found_key.objectid;
        ret = iterate_extent_inodes(fs_info, found_key.objectid,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 86eff48dab78..43d1c5a3a030 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -818,6 +818,7 @@ static void free_workspace(int type, struct list_head *workspace)
        btrfs_compress_op[idx]->free_workspace(workspace);
        atomic_dec(alloc_workspace);
 wake:
+        smp_mb();
        if (waitqueue_active(workspace_wait))
                wake_up(workspace_wait);
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9d7621f271ff..6d183f60d63a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -421,12 +421,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
        spin_unlock(&fs_info->tree_mod_seq_lock);
        /*
-         * we removed the lowest blocker from the blocker list, so there may be
-         * more processible delayed refs.
-         */
-        wake_up(&fs_info->tree_mod_seq_wait);
-        /*
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
@@ -631,6 +625,9 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
        u32 nritems;
        int ret;
+        if (btrfs_header_level(eb) == 0)
+                return;
        nritems = btrfs_header_nritems(eb);
        for (i = nritems - 1; i >= 0; i--) {
                ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4bab807227ad..0d195b507660 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1252,7 +1252,6 @@ struct btrfs_fs_info {
        atomic_t tree_mod_seq;
        struct list_head tree_mod_seq_list;
        struct seq_list tree_mod_seq_elem;
-        wait_queue_head_t tree_mod_seq_wait;
        /* this protects tree_mod_log */
        rwlock_t tree_mod_log_lock;
@@ -3192,7 +3191,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
                          struct bio *bio, u32 *dst);
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-                              struct bio *bio, u64 logical_offset, u32 *dst);
+                              struct bio *bio, u64 logical_offset);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             u64 objectid, u64 pos,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 335605c8ceab..07d5eeb1e6f1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -512,8 +512,8 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
        rb_erase(&delayed_item->rb_node, root);
        delayed_item->delayed_node->count--;
-        atomic_dec(&delayed_root->items);
+        if (atomic_dec_return(&delayed_root->items) <
-        if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
+            BTRFS_DELAYED_BACKGROUND &&
            waitqueue_active(&delayed_root->wait))
                wake_up(&delayed_root->wait);
 }
@@ -1028,9 +1028,10 @@ do_again:
                btrfs_release_delayed_item(prev);
                ret = 0;
                btrfs_release_path(path);
-                if (curr)
+                if (curr) {
+                        mutex_unlock(&node->mutex);
                        goto do_again;
-                else
+                } else
                        goto delete_fail;
        }
@@ -1055,8 +1056,7 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
                delayed_node->count--;
                delayed_root = delayed_node->root->fs_info->delayed_root;
-                atomic_dec(&delayed_root->items);
+                if (atomic_dec_return(&delayed_root->items) <
-                if (atomic_read(&delayed_root->items) <
                    BTRFS_DELAYED_BACKGROUND &&
                    waitqueue_active(&delayed_root->wait))
                        wake_up(&delayed_root->wait);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index da7419ed01bb..ae9411773397 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -38,17 +38,14 @@
 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
                          struct btrfs_delayed_tree_ref *ref1)
 {
-        if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+        if (ref1->root < ref2->root)
-                if (ref1->root < ref2->root)
+                return -1;
-                        return -1;
+        if (ref1->root > ref2->root)
-                if (ref1->root > ref2->root)
+                return 1;
-                        return 1;
+        if (ref1->parent < ref2->parent)
-        } else {
+                return -1;
-                if (ref1->parent < ref2->parent)
+        if (ref1->parent > ref2->parent)
-                        return -1;
+                return 1;
-                if (ref1->parent > ref2->parent)
-                        return 1;
-        }
        return 0;
 }
@@ -85,7 +82,8 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
 * type of the delayed backrefs and content of delayed backrefs.
 */
 static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-                      struct btrfs_delayed_ref_node *ref1)
+                      struct btrfs_delayed_ref_node *ref1,
+                      bool compare_seq)
 {
        if (ref1->bytenr < ref2->bytenr)
                return -1;
@@ -102,10 +100,12 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
        if (ref1->type > ref2->type)
                return 1;
        /* merging of sequenced refs is not allowed */
-        if (ref1->seq < ref2->seq)
+        if (compare_seq) {
-                return -1;
+                if (ref1->seq < ref2->seq)
-        if (ref1->seq > ref2->seq)
+                        return -1;
-                return 1;
+                if (ref1->seq > ref2->seq)
+                        return 1;
+        }
        if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
            ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
                return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -139,7 +139,7 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
                entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
                                 rb_node);
-                cmp = comp_entry(entry, ins);
+                cmp = comp_entry(entry, ins, 1);
                if (cmp < 0)
                        p = &(*p)->rb_left;
                else if (cmp > 0)
@@ -233,6 +233,114 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
        return 0;
 }
+/*
+ * Remove @ref from the delayed ref rbtree rooted at @delayed_refs->root.
+ *
+ * Besides unlinking the node, this clears @ref->in_tree, drops one reference
+ * through btrfs_put_delayed_ref(), decrements @delayed_refs->num_entries and,
+ * if it is non-zero, @trans->delayed_ref_updates.
+ *
+ * @ref must not be dereferenced by the caller afterwards unless the caller
+ * holds its own reference.
+ *
+ * NOTE(review): presumably called with delayed_refs->lock held - confirm
+ * against the callers.
+ */
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
+                                    struct btrfs_delayed_ref_root *delayed_refs,
+                                    struct btrfs_delayed_ref_node *ref)
+{
+        rb_erase(&ref->rb_node, &delayed_refs->root);
+        ref->in_tree = 0;
+        btrfs_put_delayed_ref(ref);
+        delayed_refs->num_entries--;
+        /* never let the per-transaction counter go below zero */
+        if (trans->delayed_ref_updates)
+                trans->delayed_ref_updates--;
+}
+/*
+ * Try to fold delayed refs that precede @ref in the rbtree into @ref.
+ *
+ * Walks backwards (rb_prev) from @ref.  The walk stops at the first node with
+ * a different bytenr, or, when @seq is non-zero, at the first node whose seq
+ * is >= @seq.  A node that compares equal to @ref under comp_entry() with
+ * sequence comparison disabled is merged: it is dropped from the tree and its
+ * ref_mod is added to (same action) or subtracted from (different action) the
+ * surviving node.  If the result is a ref_mod of 0, the survivor is dropped
+ * as well.
+ *
+ * Returns the number of nodes that were merged away (0 if nothing changed).
+ * When the return value is non-zero, @ref itself may have been dropped, so
+ * the caller must not keep using it.
+ */
+static int merge_ref(struct btrfs_trans_handle *trans,
+                     struct btrfs_delayed_ref_root *delayed_refs,
+                     struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+        struct rb_node *node;
+        int merged = 0;
+        int mod = 0;
+        int done = 0;
+        node = rb_prev(&ref->rb_node);
+        while (node) {
+                struct btrfs_delayed_ref_node *next;
+                next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+                /* step first: 'next' may be erased from the tree below */
+                node = rb_prev(node);
+                if (next->bytenr != ref->bytenr)
+                        break;
+                if (seq && next->seq >= seq)
+                        break;
+                /* non-zero means the two refs differ (seq is ignored here) */
+                if (comp_entry(ref, next, 0))
+                        continue;
+                if (ref->action == next->action) {
+                        mod = next->ref_mod;
+                } else {
+                        /*
+                         * Opposite actions cancel out.  Keep whichever node
+                         * has the larger ref_mod as 'ref' so the result stays
+                         * non-negative; after a swap we stop once this merge
+                         * is done.
+                         */
+                        if (ref->ref_mod < next->ref_mod) {
+                                struct btrfs_delayed_ref_node *tmp;
+                                tmp = ref;
+                                ref = next;
+                                next = tmp;
+                                done = 1;
+                        }
+                        mod = -next->ref_mod;
+                }
+                merged++;
+                drop_delayed_ref(trans, delayed_refs, next);
+                ref->ref_mod += mod;
+                if (ref->ref_mod == 0) {
+                        /* add and drop cancelled completely */
+                        drop_delayed_ref(trans, delayed_refs, ref);
+                        break;
+                } else {
+                        /*
+                         * You can't have multiples of the same ref on a tree
+                         * block.
+                         */
+                        WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+                                ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+                }
+                if (done)
+                        break;
+                /* the tree changed; restart from the node before 'ref' */
+                node = rb_prev(&ref->rb_node);
+        }
+        return merged;
+}
+/*
+ * Merge mergeable delayed refs that belong to @head.
+ *
+ * Reads the seq of the first element of fs_info->tree_mod_seq_list (0 if the
+ * list is empty) under tree_mod_seq_lock and uses it as an upper bound: refs
+ * whose seq is >= that value are not touched.  Then walks backwards from the
+ * head node over the refs that share the head's bytenr and calls merge_ref()
+ * on each of them.
+ *
+ * NOTE(review): presumably the caller holds delayed_refs->lock - confirm.
+ */
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+                              struct btrfs_fs_info *fs_info,
+                              struct btrfs_delayed_ref_root *delayed_refs,
+                              struct btrfs_delayed_ref_head *head)
+{
+        struct rb_node *node;
+        u64 seq = 0;
+        spin_lock(&fs_info->tree_mod_seq_lock);
+        if (!list_empty(&fs_info->tree_mod_seq_list)) {
+                struct seq_list *elem;
+                elem = list_first_entry(&fs_info->tree_mod_seq_list,
+                                        struct seq_list, list);
+                seq = elem->seq;
+        }
+        spin_unlock(&fs_info->tree_mod_seq_lock);
+        node = rb_prev(&head->node.rb_node);
+        while (node) {
+                struct btrfs_delayed_ref_node *ref;
+                ref = rb_entry(node, struct btrfs_delayed_ref_node,
+                               rb_node);
+                if (ref->bytenr != head->node.bytenr)
+                        break;
+                /* We can't merge refs that are outside of our seq count */
+                if (seq && ref->seq >= seq)
+                        break;
+                /*
+                 * If anything was merged, nodes (possibly 'ref' itself) were
+                 * erased, so start over from the head; otherwise just step
+                 * to the previous node.
+                 */
+                if (merge_ref(trans, delayed_refs, ref, seq))
+                        node = rb_prev(&head->node.rb_node);
+                else
+                        node = rb_prev(node);
+        }
+}
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
                            struct btrfs_delayed_ref_root *delayed_refs,
                            u64 seq)
@@ -336,18 +444,11 @@ update_existing_ref(struct btrfs_trans_handle *trans,
                 * every changing the extent allocation tree.
                 */
                existing->ref_mod--;
-                if (existing->ref_mod == 0) {
+                if (existing->ref_mod == 0)
-                        rb_erase(&existing->rb_node,
+                        drop_delayed_ref(trans, delayed_refs, existing);
-                                 &delayed_refs->root);
+                else
-                        existing->in_tree = 0;
-                        btrfs_put_delayed_ref(existing);
-                        delayed_refs->num_entries--;
-                        if (trans->delayed_ref_updates)
-                                trans->delayed_ref_updates--;
-                } else {
                        WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
                                existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
-                }
        } else {
                WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
                        existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -662,9 +763,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, level, action,
                                   for_cow);
-        if (!need_ref_seq(for_cow, ref_root) &&
-            waitqueue_active(&fs_info->tree_mod_seq_wait))
-                wake_up(&fs_info->tree_mod_seq_wait);
        spin_unlock(&delayed_refs->lock);
        if (need_ref_seq(for_cow, ref_root))
                btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
@@ -713,9 +811,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, owner, offset,
                                   action, for_cow);
-        if (!need_ref_seq(for_cow, ref_root) &&
-            waitqueue_active(&fs_info->tree_mod_seq_wait))
-                wake_up(&fs_info->tree_mod_seq_wait);
        spin_unlock(&delayed_refs->lock);
        if (need_ref_seq(for_cow, ref_root))
                btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
@@ -744,8 +839,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
                                   extent_op->is_data);
-        if (waitqueue_active(&fs_info->tree_mod_seq_wait))
-                wake_up(&fs_info->tree_mod_seq_wait);
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 0d7c90c366b6..ab5300595847 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,6 +167,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
                                struct btrfs_trans_handle *trans,
                                u64 bytenr, u64 num_bytes,
                                struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+                              struct btrfs_fs_info *fs_info,
+                              struct btrfs_delayed_ref_root *delayed_refs,
+                              struct btrfs_delayed_ref_head *head);
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fadeba6a5db9..22e98e04c2ea 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -377,9 +377,13 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
                ret = read_extent_buffer_pages(io_tree, eb, start,
                                               WAIT_COMPLETE,
                                               btree_get_extent, mirror_num);
-                if (!ret && !verify_parent_transid(io_tree, eb,
+                if (!ret) {
+                        if (!verify_parent_transid(io_tree, eb,
                                                   parent_transid, 0))
-                        break;
+                                break;
+                        else
+                                ret = -EIO;
+                }
                /*
                 * This buffer's crc is fine, but its contents are corrupted, so
@@ -754,9 +758,7 @@ static void run_one_async_done(struct btrfs_work *work)
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;
-        atomic_dec(&fs_info->nr_async_submits);
+        if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
-        if (atomic_read(&fs_info->nr_async_submits) < limit &&
            waitqueue_active(&fs_info->async_submit_wait))
                wake_up(&fs_info->async_submit_wait);
@@ -1614,8 +1616,6 @@ static int cleaner_kthread(void *arg)
        struct btrfs_root *root = arg;
        do {
-                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
                    mutex_trylock(&root->fs_info->cleaner_mutex)) {
                        btrfs_run_delayed_iputs(root);
@@ -1647,7 +1647,6 @@ static int transaction_kthread(void *arg)
        do {
                cannot_commit = false;
                delay = HZ * 30;
-                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                mutex_lock(&root->fs_info->transaction_kthread_mutex);
                spin_lock(&root->fs_info->trans_lock);
@@ -2035,8 +2034,6 @@ int open_ctree(struct super_block *sb,
        fs_info->free_chunk_space = 0;
        fs_info->tree_mod_log = RB_ROOT;
-        init_waitqueue_head(&fs_info->tree_mod_seq_wait);
        /* readahead state */
        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
        spin_lock_init(&fs_info->reada_lock);
@@ -2531,8 +2528,7 @@ retry_root_backup:
                goto fail_trans_kthread;
        /* do not make disk changes in broken FS */
-        if (btrfs_super_log_root(disk_super) != 0 &&
+        if (btrfs_super_log_root(disk_super) != 0) {
-            !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
                u64 bytenr = btrfs_super_log_root(disk_super);
                if (fs_devices->rw_devices == 0) {
@@ -3192,30 +3188,14 @@ int close_ctree(struct btrfs_root *root)
        /* clear out the rbtree of defraggable inodes */
        btrfs_run_defrag_inodes(fs_info);
-        /*
-         * Here come 2 situations when btrfs is broken to flip readonly:
-         *
-         * 1. when btrfs flips readonly somewhere else before
-         * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
-         * and btrfs will skip to write sb directly to keep
-         * ERROR state on disk.
-         *
-         * 2. when btrfs flips readonly just in btrfs_commit_super,
-         * and in such case, btrfs cannot write sb via btrfs_commit_super,
-         * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
-         * btrfs will cleanup all FS resources first and write sb then.
-         */
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret = btrfs_commit_super(root);
                if (ret)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
-        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
-                ret = btrfs_error_commit_super(root);
+                btrfs_error_commit_super(root);
-                if (ret)
-                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
-        }
        btrfs_put_block_group_cache(fs_info);
@@ -3437,18 +3417,11 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
        if (read_only)
                return 0;
-        if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-                printk(KERN_WARNING "warning: mount fs with errors, "
-                       "running btrfsck is recommended\n");
-        }
        return 0;
 }
-int btrfs_error_commit_super(struct btrfs_root *root)
+void btrfs_error_commit_super(struct btrfs_root *root)
 {
-        int ret;
        mutex_lock(&root->fs_info->cleaner_mutex);
        btrfs_run_delayed_iputs(root);
        mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -3458,10 +3431,6 @@ int btrfs_error_commit_super(struct btrfs_root *root)
        /* cleanup FS via transaction */
        btrfs_cleanup_transaction(root);
-        ret = write_ctree_super(NULL, root, 0);
-        return ret;
 }
 static void btrfs_destroy_ordered_operations(struct btrfs_root *root)
@@ -3785,14 +3754,17 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
                /* FIXME: cleanup wait for commit */
                t->in_commit = 1;
                t->blocked = 1;
+                smp_mb();
                if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
                        wake_up(&root->fs_info->transaction_blocked_wait);
                t->blocked = 0;
+                smp_mb();
                if (waitqueue_active(&root->fs_info->transaction_wait))
                        wake_up(&root->fs_info->transaction_wait);
                t->commit_done = 1;
+                smp_mb();
                if (waitqueue_active(&t->commit_wait))
                        wake_up(&t->commit_wait);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95e147eea239..c5b00a735fef 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -54,7 +54,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
-int btrfs_error_commit_super(struct btrfs_root *root);
+void btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize);
 struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4e1b153b7c47..ba58024d40d3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2252,6 +2252,16 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                }
                /*
+                 * We need to try and merge add/drops of the same ref since we
+                 * can run into issues with relocate dropping the implicit ref
+                 * and then it being added back again before the drop can
+                 * finish.  If we merged anything we need to re-loop so we can
+                 * get a good ref.
+                 */
+                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+                                         locked_ref);
+                /*
                 * locked_ref is the head node, so we have to go one
                 * node back for any delayed ref updates
                 */
@@ -2318,12 +2328,23 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
                ref->in_tree = 0;
                rb_erase(&ref->rb_node, &delayed_refs->root);
                delayed_refs->num_entries--;
-                /*
+                if (locked_ref) {
-                 * we modified num_entries, but as we're currently running
+                        /*
-                 * delayed refs, skip
+                         * when we play the delayed ref, also correct the
-                 *     wake_up(&delayed_refs->seq_wait);
+                         * ref_mod on head
-                 * here.
+                         */
-                 */
+                        switch (ref->action) {
+                        case BTRFS_ADD_DELAYED_REF:
+                        case BTRFS_ADD_DELAYED_EXTENT:
+                                locked_ref->node.ref_mod -= ref->ref_mod;
+                                break;
+                        case BTRFS_DROP_DELAYED_REF:
+                                locked_ref->node.ref_mod += ref->ref_mod;
+                                break;
+                        default:
+                                WARN_ON(1);
+                        }
+                }
                spin_unlock(&delayed_refs->lock);
                ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2350,22 +2371,6 @@ next:
        return count;
 }
-static void wait_for_more_refs(struct btrfs_fs_info *fs_info,
-                               struct btrfs_delayed_ref_root *delayed_refs,
-                               unsigned long num_refs,
-                               struct list_head *first_seq)
-{
-        spin_unlock(&delayed_refs->lock);
-        pr_debug("waiting for more refs (num %ld, first %p)\n",
-                 num_refs, first_seq);
-        wait_event(fs_info->tree_mod_seq_wait,
-                   num_refs != delayed_refs->num_entries ||
-                   fs_info->tree_mod_seq_list.next != first_seq);
-        pr_debug("done waiting for more refs (num %ld, first %p)\n",
-                 delayed_refs->num_entries, fs_info->tree_mod_seq_list.next);
-        spin_lock(&delayed_refs->lock);
-}
 #ifdef SCRAMBLE_DELAYED_REFS
 /*
 * Normally delayed refs get processed in ascending bytenr order. This
@@ -2460,13 +2465,11 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct list_head cluster;
-        struct list_head *first_seq = NULL;
        int ret;
        u64 delayed_start;
        int run_all = count == (unsigned long)-1;
        int run_most = 0;
-        unsigned long num_refs = 0;
+        int loops;
-        int consider_waiting;
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2484,7 +2487,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
 again:
-        consider_waiting = 0;
+        loops = 0;
        spin_lock(&delayed_refs->lock);
 #ifdef SCRAMBLE_DELAYED_REFS
@@ -2512,31 +2515,6 @@ again:
                if (ret)
                        break;
-                if (delayed_start >= delayed_refs->run_delayed_start) {
-                        if (consider_waiting == 0) {
-                                /*
-                                 * btrfs_find_ref_cluster looped. let's do one
-                                 * more cycle. if we don't run any delayed ref
-                                 * during that cycle (because we can't because
-                                 * all of them are blocked) and if the number of
-                                 * refs doesn't change, we avoid busy waiting.
-                                 */
-                                consider_waiting = 1;
-                                num_refs = delayed_refs->num_entries;
-                                first_seq = root->fs_info->tree_mod_seq_list.next;
-                        } else {
-                                wait_for_more_refs(root->fs_info, delayed_refs,
-                                                   num_refs, first_seq);
-                                /*
-                                 * after waiting, things have changed. we
-                                 * dropped the lock and someone else might have
-                                 * run some refs, built new clusters and so on.
-                                 * therefore, we restart staleness detection.
-                                 */
-                                consider_waiting = 0;
-                        }
-                }
                ret = run_clustered_refs(trans, root, &cluster);
                if (ret < 0) {
                        spin_unlock(&delayed_refs->lock);
@@ -2549,9 +2527,26 @@ again:
                if (count == 0)
                        break;
-                if (ret || delayed_refs->run_delayed_start == 0) {
+                if (delayed_start >= delayed_refs->run_delayed_start) {
+                        if (loops == 0) {
+                                /*
+                                 * btrfs_find_ref_cluster looped. Let's do one
+                                 * more cycle. If we don't run any delayed ref
+                                 * during that cycle (because all of them are
+                                 * blocked), bail out.
+                                 */
+                                loops = 1;
+                        } else {
+                                /*
+                                 * no runnable refs left, stop trying
+                                 */
+                                BUG_ON(run_all);
+                                break;
+                        }
+                }
+                if (ret) {
                        /* refs were run, let's reset staleness detection */
-                        consider_waiting = 0;
+                        loops = 0;
                }
        }
@@ -3007,17 +3002,16 @@ again:
        }
        spin_unlock(&block_group->lock);
-        num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+        /*
+         * Try to preallocate enough space based on how big the block group is.
+         * Keep in mind this has to include any pinned space which could end up
+         * taking up quite a bit since it's not folded into the other space
+         * cache.
+         */
+        num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
        if (!num_pages)
                num_pages = 1;
-        /*
-         * Just to make absolutely sure we have enough space, we're going to
-         * preallocate 12 pages worth of space for each block group.  In
-         * practice we ought to use at most 8, but we need extra space so we can
-         * add our header and have a terminator between the extents and the
-         * bitmaps.
-         */
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
@@ -4571,8 +4565,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        if (root->fs_info->quota_enabled) {
                ret = btrfs_qgroup_reserve(root, num_bytes +
                                           nr_extents * root->leafsize);
-                if (ret)
+                if (ret) {
+                        mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
                        return ret;
+                }
        }
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
@@ -5294,9 +5290,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        rb_erase(&head->node.rb_node, &delayed_refs->root);
        delayed_refs->num_entries--;
-        smp_mb();
-        if (waitqueue_active(&root->fs_info->tree_mod_seq_wait))
-                wake_up(&root->fs_info->tree_mod_seq_wait);
        /*
         * we don't take a ref on the node because we're removing it from the
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 45c81bb4ac82..4c878476bb91 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2330,23 +2330,10 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                        ret = tree->ops->readpage_end_io_hook(page, start, end,
                                                              state, mirror);
-                        if (ret) {
+                        if (ret)
-                                /* no IO indicated but software detected errors
-                                 * in the block, either checksum errors or
-                                 * issues with the contents */
-                                struct btrfs_root *root =
-                                        BTRFS_I(page->mapping->host)->root;
-                                struct btrfs_device *device;
                                uptodate = 0;
-                                device = btrfs_find_device_for_logical(
+                        else
-                                                root, start, mirror);
-                                if (device)
-                                        btrfs_dev_stat_inc_and_print(device,
-                                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
-                        } else {
                                clean_io_failure(start, page);
-                        }
                }
                if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index b45b9de0c21d..857d93cd01dc 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -272,9 +272,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 }
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-                              struct bio *bio, u64 offset, u32 *dst)
+                              struct bio *bio, u64 offset)
 {
-        return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+        return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
 }
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9aa01ec2138d..5caf285c6e4d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1379,7 +1379,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
        ssize_t err = 0;
        size_t count, ocount;
-        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+        sb_start_write(inode->i_sb);
        mutex_lock(&inode->i_mutex);
@@ -1469,6 +1469,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                        num_written = err;
        }
 out:
+        sb_end_write(inode->i_sb);
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 48bdfd2591c2..ec154f954646 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -324,7 +324,8 @@ static noinline int add_async_extent(struct async_cow *cow,
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
- * are written in the same order that pdflush sent them down.
+ * are written in the same order that the flusher thread sent them
+ * down.
 */
 static noinline int compress_file_range(struct inode *inode,
                                        struct page *locked_page,
@@ -1007,9 +1008,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;
-        atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
+        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
-        if (atomic_read(&root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
                wake_up(&root->fs_info->async_submit_wait);
@@ -1884,8 +1883,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
                                trans = btrfs_join_transaction_nolock(root);
                        else
                                trans = btrfs_join_transaction(root);
-                        if (IS_ERR(trans))
+                        if (IS_ERR(trans)) {
-                                return PTR_ERR(trans);
+                                ret = PTR_ERR(trans);
+                                trans = NULL;
+                                goto out;
+                        }
                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode_fallback(trans, root, inode);
                        if (ret) /* -ENOMEM or corruption */
@@ -3173,7 +3175,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
        inode_inc_iversion(dir);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-        ret = btrfs_update_inode(trans, root, dir);
+        ret = btrfs_update_inode_fallback(trans, root, dir);
        if (ret)
                btrfs_abort_transaction(trans, root, ret);
 out:
@@ -5773,18 +5775,112 @@ out:
        return ret;
 }
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+                              struct extent_state **cached_state, int writing)
+{
+        struct btrfs_ordered_extent *ordered;
+        int ret = 0;
+        while (1) {
+                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                 0, cached_state);
+                /*
+                 * We're concerned with the entire range that we're going to be
+                 * doing DIO to, so we need to make sure there are no ordered
+                 * extents in this range.
+                 */
+                ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                     lockend - lockstart + 1);
+                /*
+                 * We need to make sure there are no buffered pages in this
+                 * range either, we could have raced between the invalidate in
+                 * generic_file_direct_write and locking the extent.  The
+                 * invalidate needs to happen so that reads after a write do not
+                 * get stale data.
+                 */
+                if (!ordered && (!writing ||
+                    !test_range_bit(&BTRFS_I(inode)->io_tree,
+                                    lockstart, lockend, EXTENT_UPTODATE, 0,
+                                    *cached_state)))
+                        break;
+                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                     cached_state, GFP_NOFS);
+                if (ordered) {
+                        btrfs_start_ordered_extent(inode, ordered, 1);
+                        btrfs_put_ordered_extent(ordered);
+                } else {
+                        /* Screw you mmap */
+                        ret = filemap_write_and_wait_range(inode->i_mapping,
+                                                           lockstart,
+                                                           lockend);
+                        if (ret)
+                                break;
+                        /*
+                         * If we found a page that couldn't be invalidated just
+                         * fall back to buffered.
+                         */
+                        ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                        lockstart >> PAGE_CACHE_SHIFT,
+                                        lockend >> PAGE_CACHE_SHIFT);
+                        if (ret)
+                                break;
+                }
+                cond_resched();
+        }
+        return ret;
+}
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct extent_state *cached_state = NULL;
        u64 start = iblock << inode->i_blkbits;
+        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
        struct btrfs_trans_handle *trans;
+        int unlock_bits = EXTENT_LOCKED;
+        int ret;
+        if (create) {
+                ret = btrfs_delalloc_reserve_space(inode, len);
+                if (ret)
+                        return ret;
+                unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+        } else {
+                len = min_t(u64, len, root->sectorsize);
+        }
+        lockstart = start;
+        lockend = start + len - 1;
+        /*
+         * If this errors out it's because we couldn't invalidate pagecache for
+         * this range and we need to fall back to buffered.
+         */
+        if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+                return -ENOTBLK;
+        if (create) {
+                ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                     lockend, EXTENT_DELALLOC, NULL,
+                                     &cached_state, GFP_NOFS);
+                if (ret)
+                        goto unlock_err;
+        }
        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-        if (IS_ERR(em))
+        if (IS_ERR(em)) {
-                return PTR_ERR(em);
+                ret = PTR_ERR(em);
+                goto unlock_err;
+        }
        /*
         * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5803,17 +5899,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
            em->block_start == EXTENT_MAP_INLINE) {
                free_extent_map(em);
-                return -ENOTBLK;
+                ret = -ENOTBLK;
+                goto unlock_err;
        }
        /* Just a good old fashioned hole, return */
        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
                free_extent_map(em);
-                /* DIO will do one hole at a time, so just unlock a sector */
+                ret = 0;
-                unlock_extent(&BTRFS_I(inode)->io_tree, start,
+                goto unlock_err;
-                              start + root->sectorsize - 1);
-                return 0;
        }
        /*
@@ -5826,8 +5921,9 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         *
         */
        if (!create) {
-                len = em->len - (start - em->start);
+                len = min(len, em->len - (start - em->start));
-                goto map;
+                lockstart = start + len;
+                goto unlock;
        }
        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5859,7 +5955,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                        btrfs_end_transaction(trans, root);
                        if (ret) {
                                free_extent_map(em);
-                                return ret;
+                                goto unlock_err;
                        }
                        goto unlock;
                }
@@ -5872,14 +5968,12 @@ must_cow:
         */
        len = bh_result->b_size;
        em = btrfs_new_extent_direct(inode, em, start, len);
-        if (IS_ERR(em))
+        if (IS_ERR(em)) {
-                return PTR_ERR(em);
+                ret = PTR_ERR(em);
+                goto unlock_err;
+        }
        len = min(len, em->len - (start - em->start));
 unlock:
-        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-                          EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
-                          0, NULL, GFP_NOFS);
-map:
        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
                inode->i_blkbits;
        bh_result->b_size = len;
@@ -5897,9 +5991,44 @@ map:
                        i_size_write(inode, start + len);
        }
+        /*
+         * In the case of write we need to clear and unlock the entire range,
+         * in the case of read we need to unlock only the end area that we
+         * aren't using if there is any left over space.
+         */
+        if (lockstart < lockend) {
+                if (create && len < lockend - lockstart) {
+                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                         lockstart + len - 1, unlock_bits, 1, 0,
+                                         &cached_state, GFP_NOFS);
+                        /*
+                         * Besides unlocking, also clean up the reserved space
+                         * for the rest of the range via EXTENT_DO_ACCOUNTING.
+                         */
+                        clear_extent_bit(&BTRFS_I(inode)->io_tree,
+                                         lockstart + len, lockend,
+                                         unlock_bits | EXTENT_DO_ACCOUNTING,
+                                         1, 0, NULL, GFP_NOFS);
+                } else {
+                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                         lockend, unlock_bits, 1, 0,
+                                         &cached_state, GFP_NOFS);
+                }
+        } else {
+                free_extent_state(cached_state);
+        }
        free_extent_map(em);
        return 0;
+unlock_err:
+        if (create)
+                unlock_bits |= EXTENT_DO_ACCOUNTING;
+        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+        return ret;
 }
 struct btrfs_dio_private {
@@ -5907,7 +6036,6 @@ struct btrfs_dio_private {
        u64 logical_offset;
        u64 disk_bytenr;
        u64 bytes;
-        u32 *csums;
        void *private;
        /* number of bios pending for this dio */
@@ -5927,7 +6055,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 start;
-        u32 *private = dip->csums;
        start = dip->logical_offset;
        do {
@@ -5935,8 +6062,12 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
                        struct page *page = bvec->bv_page;
                        char *kaddr;
                        u32 csum = ~(u32)0;
+                        u64 private = ~(u32)0;
                        unsigned long flags;
+                        if (get_state_private(&BTRFS_I(inode)->io_tree,
+                                              start, &private))
+                                goto failed;
                        local_irq_save(flags);
                        kaddr = kmap_atomic(page);
                        csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
@@ -5946,18 +6077,18 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
                        local_irq_restore(flags);
                        flush_dcache_page(bvec->bv_page);
-                        if (csum != *private) {
+                        if (csum != private) {
+failed:
                                printk(KERN_ERR "btrfs csum failed ino %llu off"
                                      " %llu csum %u private %u\n",
                                      (unsigned long long)btrfs_ino(inode),
                                      (unsigned long long)start,
-                                      csum, *private);
+                                      csum, (unsigned)private);
                                err = -EIO;
                        }
                }
                start += bvec->bv_len;
-                private++;
                bvec++;
        } while (bvec <= bvec_end);
@@ -5965,7 +6096,6 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
                      dip->logical_offset + dip->bytes - 1);
        bio->bi_private = dip->private;
-        kfree(dip->csums);
        kfree(dip);
        /* If we had a csum failure make sure to clear the uptodate flag */
@@ -6071,7 +6201,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                                         int rw, u64 file_offset, int skip_sum,
-                                         u32 *csums, int async_submit)
+                                         int async_submit)
 {
        int write = rw & REQ_WRITE;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -6104,8 +6234,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                if (ret)
                        goto err;
        } else if (!skip_sum) {
-                ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
+                ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
-                                          file_offset, csums);
                if (ret)
                        goto err;
        }
@@ -6131,10 +6260,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
        u64 submit_len = 0;
        u64 map_length;
        int nr_pages = 0;
-        u32 *csums = dip->csums;
        int ret = 0;
        int async_submit = 0;
-        int write = rw & REQ_WRITE;
        map_length = orig_bio->bi_size;
        ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -6170,16 +6297,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                        atomic_inc(&dip->pending_bios);
                        ret = __btrfs_submit_dio_bio(bio, inode, rw,
                                                     file_offset, skip_sum,
-                                                     csums, async_submit);
+                                                     async_submit);
                        if (ret) {
                                bio_put(bio);
                                atomic_dec(&dip->pending_bios);
                                goto out_err;
                        }
-                        /* Write's use the ordered csums */
-                        if (!write && !skip_sum)
-                                csums = csums + nr_pages;
                        start_sector += submit_len >> 9;
                        file_offset += submit_len;
@@ -6209,7 +6333,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 submit:
        ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
-                                     csums, async_submit);
+                                     async_submit);
        if (!ret)
                return 0;
@@ -6245,17 +6369,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
                ret = -ENOMEM;
                goto free_ordered;
        }
-        dip->csums = NULL;
-        /* Write's use the ordered csum stuff, so we don't need dip->csums */
-        if (!write && !skip_sum) {
-                dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
-                if (!dip->csums) {
-                        kfree(dip);
-                        ret = -ENOMEM;
-                        goto free_ordered;
-                }
-        }
        dip->private = bio->bi_private;
        dip->inode = inode;
@@ -6340,132 +6453,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
 out:
        return retval;
 }
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
                        unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-        struct btrfs_ordered_extent *ordered;
-        struct extent_state *cached_state = NULL;
-        u64 lockstart, lockend;
-        ssize_t ret;
-        int writing = rw & WRITE;
-        int write_bits = 0;
-        size_t count = iov_length(iov, nr_segs);
        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-                            offset, nr_segs)) {
+                            offset, nr_segs))
                return 0;
-        }
-        lockstart = offset;
+        return __blockdev_direct_IO(rw, iocb, inode,
-        lockend = offset + count - 1;
-        if (writing) {
-                ret = btrfs_delalloc_reserve_space(inode, count);
-                if (ret)
-                        goto out;
-        }
-        while (1) {
-                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                 0, &cached_state);
-                /*
-                 * We're concerned with the entire range that we're going to be
-                 * doing DIO to, so we need to make sure theres no ordered
-                 * extents in this range.
-                 */
-                ordered = btrfs_lookup_ordered_range(inode, lockstart,
-                                                     lockend - lockstart + 1);
-                /*
-                 * We need to make sure there are no buffered pages in this
-                 * range either, we could have raced between the invalidate in
-                 * generic_file_direct_write and locking the extent.  The
-                 * invalidate needs to happen so that reads after a write do not
-                 * get stale data.
-                 */
-                if (!ordered && (!writing ||
-                    !test_range_bit(&BTRFS_I(inode)->io_tree,
-                                    lockstart, lockend, EXTENT_UPTODATE, 0,
-                                    cached_state)))
-                        break;
-                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                     &cached_state, GFP_NOFS);
-                if (ordered) {
-                        btrfs_start_ordered_extent(inode, ordered, 1);
-                        btrfs_put_ordered_extent(ordered);
-                } else {
-                        /* Screw you mmap */
-                        ret = filemap_write_and_wait_range(file->f_mapping,
-                                                           lockstart,
-                                                           lockend);
-                        if (ret)
-                                goto out;
-                        /*
-                         * If we found a page that couldn't be invalidated just
-                         * fall back to buffered.
-                         */
-                        ret = invalidate_inode_pages2_range(file->f_mapping,
-                                        lockstart >> PAGE_CACHE_SHIFT,
-                                        lockend >> PAGE_CACHE_SHIFT);
-                        if (ret) {
-                                if (ret == -EBUSY)
-                                        ret = 0;
-                                goto out;
-                        }
-                }
-                cond_resched();
-        }
-        /*
-         * we don't use btrfs_set_extent_delalloc because we don't want
-         * the dirty or uptodate bits
-         */
-        if (writing) {
-                write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
-                ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                     EXTENT_DELALLOC, NULL, &cached_state,
-                                     GFP_NOFS);
-                if (ret) {
-                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-                                         lockend, EXTENT_LOCKED | write_bits,
-                                         1, 0, &cached_state, GFP_NOFS);
-                        goto out;
-                }
-        }
-        free_extent_state(cached_state);
-        cached_state = NULL;
-        ret = __blockdev_direct_IO(rw, iocb, inode,
                   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
                   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
                   btrfs_submit_direct, 0);
-        if (ret < 0 && ret != -EIOCBQUEUED) {
-                clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-                              offset + iov_length(iov, nr_segs) - 1,
-                              EXTENT_LOCKED | write_bits, 1, 0,
-                              &cached_state, GFP_NOFS);
-        } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
-                /*
-                 * We're falling back to buffered, unlock the section we didn't
-                 * do IO on.
-                 */
-                clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
-                              offset + iov_length(iov, nr_segs) - 1,
-                              EXTENT_LOCKED | write_bits, 1, 0,
-                              &cached_state, GFP_NOFS);
-        }
-out:
-        free_extent_state(cached_state);
-        return ret;
 }
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -6629,6 +6632,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_start;
        u64 page_end;
+        sb_start_pagefault(inode->i_sb);
        ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (!ret) {
                ret = file_update_time(vma->vm_file);
@@ -6718,12 +6722,15 @@ again:
        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 out_unlock:
-        if (!ret)
+        if (!ret) {
+                sb_end_pagefault(inode->i_sb);
                return VM_FAULT_LOCKED;
+        }
        unlock_page(page);
 out:
        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out_noreserve:
+        sb_end_pagefault(inode->i_sb);
        return ret;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 43f0012016e3..9df50fa8a078 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -195,6 +195,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        if (!inode_owner_or_capable(inode))
                return -EACCES;
+        ret = mnt_want_write_file(file);
+        if (ret)
+                return ret;
        mutex_lock(&inode->i_mutex);
        ip_oldflags = ip->flags;
@@ -209,10 +213,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                }
        }
-        ret = mnt_want_write_file(file);
-        if (ret)
-                goto out_unlock;
        if (flags & FS_SYNC_FL)
                ip->flags |= BTRFS_INODE_SYNC;
        else
@@ -275,9 +275,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
                inode->i_flags = i_oldflags;
        }
-        mnt_drop_write_file(file);
 out_unlock:
        mutex_unlock(&inode->i_mutex);
+        mnt_drop_write_file(file);
        return ret;
 }
@@ -424,7 +424,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        uuid_le_gen(&new_uuid);
        memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
        root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
-        root_item.otime.nsec = cpu_to_le64(cur_time.tv_nsec);
+        root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec);
        root_item.ctime = root_item.otime;
        btrfs_set_root_ctransid(&root_item, trans->transid);
        btrfs_set_root_otransid(&root_item, trans->transid);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index a44eff074805..2a1762c66041 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -67,7 +67,7 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
        if (eb->lock_nested) {
                read_lock(&eb->lock);
-                if (&eb->lock_nested && current->pid == eb->lock_owner) {
+                if (eb->lock_nested && current->pid == eb->lock_owner) {
                        read_unlock(&eb->lock);
                        return;
                }
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 643335a4fe3c..051c7fe551dd 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -596,7 +596,7 @@ void btrfs_start_ordered_extent(struct inode *inode,
        /*
         * pages in the range can be dirty, clean or writeback.  We
         * start IO on any dirty ones so the wait doesn't stall waiting
-         * for pdflush to find them
+         * for the flusher thread to find them
         */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->i_mapping, start, end);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bc424ae5a81a..38b42e7bc91d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1364,13 +1364,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
        spin_lock(&fs_info->qgroup_lock);
        dstgroup = add_qgroup_rb(fs_info, objectid);
-        if (!dstgroup)
+        if (IS_ERR(dstgroup)) {
+                ret = PTR_ERR(dstgroup);
                goto unlock;
+        }
        if (srcid) {
                srcgroup = find_qgroup_rb(fs_info, srcid);
-                if (!srcgroup)
+                if (!srcgroup) {
+                        ret = -EINVAL;
                        goto unlock;
+                }
                dstgroup->rfer = srcgroup->rfer - level_size;
                dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
                srcgroup->excl = level_size;
@@ -1379,8 +1383,10 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
                qgroup_dirty(fs_info, srcgroup);
        }
-        if (!inherit)
+        if (!inherit) {
+                ret = -EINVAL;
                goto unlock;
+        }
        i_qgroups = (u64 *)(inherit + 1);
        for (i = 0; i < inherit->num_qgroups; ++i) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6bb465cca20f..10d8e4d88071 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -544,8 +544,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
        struct timespec ct = CURRENT_TIME;
        spin_lock(&root->root_times_lock);
-        item->ctransid = trans->transid;
+        item->ctransid = cpu_to_le64(trans->transid);
        item->ctime.sec = cpu_to_le64(ct.tv_sec);
-        item->ctime.nsec = cpu_to_le64(ct.tv_nsec);
+        item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
        spin_unlock(&root->root_times_lock);
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8c6e61d6eed5..83d6f9f9c220 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -100,10 +100,6 @@ static void __save_error_info(struct btrfs_fs_info *fs_info)
        fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
 }
-/* NOTE:
- *      We move write_super stuff at umount in order to avoid deadlock
- *      for umount hold all lock.
- */
 static void save_error_info(struct btrfs_fs_info *fs_info)
 {
        __save_error_info(fs_info);
@@ -842,7 +838,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        struct btrfs_trans_handle *trans;
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
        struct btrfs_root *root = fs_info->tree_root;
-        int ret;
        trace_btrfs_sync_fs(wait);
@@ -853,11 +848,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        btrfs_wait_ordered_extents(root, 0, 0);
-        trans = btrfs_start_transaction(root, 0);
+        spin_lock(&fs_info->trans_lock);
+        if (!fs_info->running_transaction) {
+                spin_unlock(&fs_info->trans_lock);
+                return 0;
+        }
+        spin_unlock(&fs_info->trans_lock);
+        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
-        ret = btrfs_commit_transaction(trans, root);
+        return btrfs_commit_transaction(trans, root);
-        return ret;
 }
 static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
@@ -1534,6 +1535,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
        while (cur_devices) {
                head = &cur_devices->devices;
                list_for_each_entry(dev, head, dev_list) {
+                        if (dev->missing)
+                                continue;
                        if (!first_dev || dev->devid < first_dev->devid)
                                first_dev = dev;
                }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7ac7cdcc294e..27c26004e050 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -335,6 +335,8 @@ again:
        if (!h)
                return ERR_PTR(-ENOMEM);
+        sb_start_intwrite(root->fs_info->sb);
        if (may_wait_transaction(root, type))
                wait_current_trans(root);
@@ -345,6 +347,7 @@ again:
        } while (ret == -EBUSY);
        if (ret < 0) {
+                sb_end_intwrite(root->fs_info->sb);
                kmem_cache_free(btrfs_trans_handle_cachep, h);
                return ERR_PTR(ret);
        }
@@ -548,6 +551,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
+        sb_end_intwrite(root->fs_info->sb);
        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
            should_end_transaction(trans, root)) {
                trans->transaction->blocked = 1;
@@ -1026,6 +1031,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        btrfs_i_size_write(parent_inode, parent_inode->i_size +
                                         dentry->d_name.len * 2);
+        parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        if (ret)
                goto abort_trans_dput;
@@ -1061,7 +1067,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        memcpy(new_root_item->parent_uuid, root->root_item.uuid,
                        BTRFS_UUID_SIZE);
        new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
-        new_root_item->otime.nsec = cpu_to_le64(cur_time.tv_nsec);
+        new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec);
        btrfs_set_root_otransid(new_root_item, trans->transid);
        memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
        memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
@@ -1578,6 +1584,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        put_transaction(cur_trans);
        put_transaction(cur_trans);
+        sb_end_intwrite(root->fs_info->sb);
        trace_btrfs_transaction_commit(root);
        btrfs_scrub_continue(root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b8708f994e67..88b969aeeb71 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -227,9 +227,8 @@ loop_lock:
                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;
-                atomic_dec(&fs_info->nr_async_bios);
-                if (atomic_read(&fs_info->nr_async_bios) < limit &&
+                if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);
@@ -569,9 +568,11 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                memcpy(new_device, device, sizeof(*new_device));
                /* Safe because we are under uuid_mutex */
-                name = rcu_string_strdup(device->name->str, GFP_NOFS);
+                if (device->name) {
-                BUG_ON(device->name && !name); /* -ENOMEM */
+                        name = rcu_string_strdup(device->name->str, GFP_NOFS);
-                rcu_assign_pointer(new_device->name, name);
+                        BUG_ON(device->name && !name); /* -ENOMEM */
+                        rcu_assign_pointer(new_device->name, name);
+                }
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
@@ -1744,10 +1745,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        device->fs_devices = root->fs_info->fs_devices;
-        /*
-         * we don't want write_supers to jump in here with our device
-         * half setup
-         */
        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
        list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
        list_add(&device->dev_alloc_list,
@@ -4609,28 +4606,6 @@ int btrfs_read_sys_array(struct btrfs_root *root)
        return ret;
 }
-struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
-                                                   u64 logical, int mirror_num)
-{
-        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
-        int ret;
-        u64 map_length = 0;
-        struct btrfs_bio *bbio = NULL;
-        struct btrfs_device *device;
-        BUG_ON(mirror_num == 0);
-        ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
-                              mirror_num);
-        if (ret) {
-                BUG_ON(bbio != NULL);
-                return NULL;
-        }
-        BUG_ON(mirror_num != bbio->mirror_num);
-        device = bbio->stripes[mirror_num - 1].dev;
-        kfree(bbio);
-        return device;
-}
 int btrfs_read_chunk_tree(struct btrfs_root *root)
 {
        struct btrfs_path *path;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5479325987b3..53c06af92e8d 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -289,8 +289,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);
-struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
-                                                   u64 logical, int mirror_num);
 void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
 int btrfs_get_dev_stats(struct btrfs_root *root,
diff --git a/fs/buffer.c b/fs/buffer.c
index c7062c896d7c..58e2e7b77372 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -914,7 +914,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 /*
 * Initialise the state of a blockdev page's buffers.
 */ 
-static void
+static sector_t
 init_page_buffers(struct page *page, struct block_device *bdev,
                        sector_t block, int size)
 {
@@ -936,33 +936,41 @@ init_page_buffers(struct page *page, struct block_device *bdev,
                block++;
                bh = bh->b_this_page;
        } while (bh != head);
+        /*
+         * Caller needs to validate requested block against end of device.
+         */
+        return end_block;
 }
 /*
 * Create the page-cache page that contains the requested block.
 *
- * This is user purely for blockdev mappings.
+ * This is used purely for blockdev mappings.
 */
-static struct page *
+static int
 grow_dev_page(struct block_device *bdev, sector_t block,
-                pgoff_t index, int size)
+                pgoff_t index, int size, int sizebits)
 {
        struct inode *inode = bdev->bd_inode;
        struct page *page;
        struct buffer_head *bh;
+        sector_t end_block;
+        int ret = 0;            /* Will call free_more_memory() */
        page = find_or_create_page(inode->i_mapping, index,
                (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
        if (!page)
-                return NULL;
+                return ret;
        BUG_ON(!PageLocked(page));
        if (page_has_buffers(page)) {
                bh = page_buffers(page);
                if (bh->b_size == size) {
-                        init_page_buffers(page, bdev, block, size);
+                        end_block = init_page_buffers(page, bdev,
-                        return page;
+                                                index << sizebits, size);
+                        goto done;
                }
                if (!try_to_free_buffers(page))
                        goto failed;
@@ -982,14 +990,14 @@ grow_dev_page(struct block_device *bdev, sector_t block,
         */
        spin_lock(&inode->i_mapping->private_lock);
        link_dev_buffers(page, bh);
-        init_page_buffers(page, bdev, block, size);
+        end_block = init_page_buffers(page, bdev, index << sizebits, size);
        spin_unlock(&inode->i_mapping->private_lock);
-        return page;
+done:
+        ret = (block < end_block) ? 1 : -ENXIO;
 failed:
        unlock_page(page);
        page_cache_release(page);
-        return NULL;
+        return ret;
 }
 /*
@@ -999,7 +1007,6 @@ failed:
 static int
 grow_buffers(struct block_device *bdev, sector_t block, int size)
 {
-        struct page *page;
        pgoff_t index;
        int sizebits;
@@ -1023,22 +1030,14 @@ grow_buffers(struct block_device *bdev, sector_t block, int size)
                        bdevname(bdev, b));
                return -EIO;
        }
-        block = index << sizebits;
        /* Create a page with the proper size buffers.. */
-        page = grow_dev_page(bdev, block, index, size);
+        return grow_dev_page(bdev, block, index, size, sizebits);
-        if (!page)
-                return 0;
-        unlock_page(page);
-        page_cache_release(page);
-        return 1;
 }
 static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
-        int ret;
-        struct buffer_head *bh;
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
@@ -1051,21 +1050,20 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
                return NULL;
        }
-retry:
+        for (;;) {
-        bh = __find_get_block(bdev, block, size);
+                struct buffer_head *bh;
-        if (bh)
+                int ret;
-                return bh;
-        ret = grow_buffers(bdev, block, size);
-        if (ret == 0) {
-                free_more_memory();
-                goto retry;
-        } else if (ret > 0) {
                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;
+                ret = grow_buffers(bdev, block, size);
+                if (ret < 0)
+                        return NULL;
+                if (ret == 0)
+                        free_more_memory();
        }
-        return NULL;
 }
 /*
@@ -1321,10 +1319,6 @@ EXPORT_SYMBOL(__find_get_block);
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
- * __getblk() cannot fail - it just keeps trying.  If you pass it an
- * illegal block number, __getblk() will happily return a buffer_head
- * which represents the non-existent block.  Very weird.
- *
 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
 * attempt is failing.  FIXME, perhaps?
 */
@@ -2306,8 +2300,8 @@ EXPORT_SYMBOL(block_commit_write);
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
- * Direct callers of this function should call vfs_check_frozen() so that page
+ * Direct callers of this function should protect against filesystem freezing
- * fault does not busyloop until the fs is thawed.
+ * using sb_start_write() - sb_end_write() functions.
 */
 int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                         get_block_t get_block)
@@ -2318,6 +2312,12 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
        loff_t size;
        int ret;
+        /*
+         * Update file times before taking page lock. We may end up failing the
+         * fault so this update may be superfluous but who really cares...
+         */
+        file_update_time(vma->vm_file);
        lock_page(page);
        size = i_size_read(inode);
        if ((page->mapping != inode->i_mapping) ||
@@ -2339,18 +2339,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
        if (unlikely(ret < 0))
                goto out_unlock;
-        /*
-         * Freezing in progress? We check after the page is marked dirty and
-         * with page lock held so if the test here fails, we are sure freezing
-         * code will wait during syncing until the page fault is done - at that
-         * point page will be dirty and unlocked so freezing code will write it
-         * and writeprotect it again.
-         */
        set_page_dirty(page);
-        if (inode->i_sb->s_frozen != SB_UNFROZEN) {
-                ret = -EAGAIN;
-                goto out_unlock;
-        }
        wait_on_page_writeback(page);
        return 0;
 out_unlock:
@@ -2365,12 +2354,9 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
        int ret;
        struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
-        /*
+        sb_start_pagefault(sb);
-         * This check is racy but catches the common case. The check in
-         * __block_page_mkwrite() is reliable.
-         */
-        vfs_check_frozen(sb, SB_FREEZE_WRITE);
        ret = __block_page_mkwrite(vma, vmf, get_block);
+        sb_end_pagefault(sb);
        return block_page_mkwrite_return(ret);
 }
 EXPORT_SYMBOL(block_page_mkwrite);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 8b67304e4b80..452e71a1b753 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1184,6 +1184,9 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        loff_t size, len;
        int ret;
+        /* Update time before taking page lock */
+        file_update_time(vma->vm_file);
        size = i_size_read(inode);
        if (off + PAGE_CACHE_SIZE <= size)
                len = PAGE_CACHE_SIZE;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fb962efdacee..6d59006bfa27 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -201,6 +201,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
        int err = -ENOMEM;
        dout("ceph_fs_debugfs_init\n");
+        BUG_ON(!fsc->client->debugfs_dir);
        fsc->debugfs_congestion_kb =
                debugfs_create_file("writeback_congestion_kb",
                                    0600,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f391f1e75414..e5b77319c97b 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -633,44 +633,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
        return dentry;
 }
-int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
-                     struct file *file, unsigned flags, umode_t mode,
-                     int *opened)
-{
-        int err;
-        struct dentry *res = NULL;
-        if (!(flags & O_CREAT)) {
-                if (dentry->d_name.len > NAME_MAX)
-                        return -ENAMETOOLONG;
-                err = ceph_init_dentry(dentry);
-                if (err < 0)
-                        return err;
-                return ceph_lookup_open(dir, dentry, file, flags, mode, opened);
-        }
-        if (d_unhashed(dentry)) {
-                res = ceph_lookup(dir, dentry, 0);
-                if (IS_ERR(res))
-                        return PTR_ERR(res);
-                if (res)
-                        dentry = res;
-        }
-        /* We don't deal with positive dentries here */
-        if (dentry->d_inode)
-                return finish_no_open(file, res);
-        *opened |= FILE_CREATED;
-        err = ceph_lookup_open(dir, dentry, file, flags, mode, opened);
-        dput(res);
-        return err;
-}
 /*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 1b81d6c31878..ecebbc09bfc7 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -4,6 +4,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/writeback.h>
@@ -106,9 +107,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 }
 /*
- * If the filp already has private_data, that means the file was
- * already opened by intent during lookup, and we do nothing.
- *
 * If we already have the requisite capabilities, we can satisfy
 * the open request locally (no need to request new caps from the
 * MDS).  We do, however, need to inform the MDS (asynchronously)
@@ -207,24 +205,29 @@ out:
 /*
- * Do a lookup + open with a single request.
+ * Do a lookup + open with a single request.  If we get a non-existent
- *
+ * file or symlink, return 1 so the VFS can retry.
- * If this succeeds, but some subsequent check in the vfs
- * may_open() fails, the struct *file gets cleaned up (i.e.
- * ceph_release gets called).  So fear not!
 */
-int ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
                     struct file *file, unsigned flags, umode_t mode,
                     int *opened)
 {
        struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
        struct ceph_mds_client *mdsc = fsc->mdsc;
        struct ceph_mds_request *req;
-        struct dentry *ret;
+        struct dentry *dn;
        int err;
-        dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
+        dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
-             dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
+             dir, dentry, dentry->d_name.len, dentry->d_name.name,
+             d_unhashed(dentry) ? "unhashed" : "hashed", flags, mode);
+        if (dentry->d_name.len > NAME_MAX)
+                return -ENAMETOOLONG;
+        err = ceph_init_dentry(dentry);
+        if (err < 0)
+                return err;
        /* do the open */
        req = prepare_open_request(dir->i_sb, flags, mode);
@@ -241,22 +244,31 @@ int ceph_lookup_open(struct inode *dir, struct dentry *dentry,
                                   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
                                   req);
        err = ceph_handle_snapdir(req, dentry, err);
-        if (err)
+        if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
-                goto out;
-        if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
                err = ceph_handle_notrace_create(dir, dentry);
-        if (err)
-                goto out;
-        err = finish_open(file, req->r_dentry, ceph_open, opened);
-out:
-        ret = ceph_finish_lookup(req, dentry, err);
-        ceph_mdsc_put_request(req);
-        dout("ceph_lookup_open result=%p\n", ret);
-        if (IS_ERR(ret))
+        if (d_unhashed(dentry)) {
-                return PTR_ERR(ret);
+                dn = ceph_finish_lookup(req, dentry, err);
+                if (IS_ERR(dn))
+                        err = PTR_ERR(dn);
+        } else {
+                /* we were given a hashed negative dentry */
+                dn = NULL;
+        }
+        if (err)
+                goto out_err;
+        if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
+                /* make vfs retry on splice, ENOENT, or symlink */
+                dout("atomic_open finish_no_open on dn %p\n", dn);
+                err = finish_no_open(file, dn);
+        } else {
+                dout("atomic_open finish_open on dn %p\n", dn);
+                err = finish_open(file, dentry, ceph_open, opened);
+        }
-        dput(ret);
+out_err:
+        ceph_mdsc_put_request(req);
+        dout("atomic_open result=%d\n", err);
        return err;
 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 9fff9f3b17e4..4b5762ef7c2b 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -992,11 +992,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
        if (rinfo->head->is_dentry) {
                struct inode *dir = req->r_locked_dir;
-                err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
+                if (dir) {
-                                 session, req->r_request_started, -1,
+                        err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
-                                 &req->r_caps_reservation);
+                                         session, req->r_request_started, -1,
-                if (err < 0)
+                                         &req->r_caps_reservation);
-                        return err;
+                        if (err < 0)
+                                return err;
+                } else {
+                        WARN_ON_ONCE(1);
+                }
        }
        /*
@@ -1004,6 +1008,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
         * will have trouble splicing in the virtual snapdir later
         */
        if (rinfo->head->is_dentry && !req->r_aborted &&
+            req->r_locked_dir &&
            (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
                                               fsc->mount_options->snapdir_name,
                                               req->r_dentry->d_name.len))) {
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8e3fb69fbe62..1396ceb46797 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -42,7 +42,8 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
        /* validate striping parameters */
        if ((l->object_size & ~PAGE_MASK) ||
            (l->stripe_unit & ~PAGE_MASK) ||
-            ((unsigned)l->object_size % (unsigned)l->stripe_unit))
+            (l->stripe_unit != 0 &&
+             ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
                return -EINVAL;
        /* make sure it's a valid data pool */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ebc95cc652be..66ebe720e40d 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -806,9 +806,9 @@ extern int ceph_copy_from_page_vector(struct page **pages,
                                    loff_t off, size_t len);
 extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
 extern int ceph_open(struct inode *inode, struct file *file);
-extern int ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
-                             struct file *od, unsigned flags,
+                            struct file *file, unsigned flags, umode_t mode,
-                             umode_t mode, int *opened);
+                            int *opened);
 extern int ceph_release(struct inode *inode, struct file *filp);
 /* dir.c */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 497da5ce704c..977dc0e85ccb 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -246,6 +246,16 @@ struct smb_version_operations {
        bool (*can_echo)(struct TCP_Server_Info *);
        /* send echo request */
        int (*echo)(struct TCP_Server_Info *);
+        /* create directory */
+        int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *,
+                     struct cifs_sb_info *);
+        /* set info on created directory */
+        void (*mkdir_setinfo)(struct inode *, const char *,
+                              struct cifs_sb_info *, struct cifs_tcon *,
+                              const unsigned int);
+        /* remove directory */
+        int (*rmdir)(const unsigned int, struct cifs_tcon *, const char *,
+                     struct cifs_sb_info *);
 };
 struct smb_version_values {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index cf7fb185103c..f1bbf8305d3a 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -289,18 +289,15 @@ extern int CIFSSMBUnixSetFileInfo(const unsigned int xid,
                                  u16 fid, u32 pid_of_opener);
 extern int CIFSSMBUnixSetPathInfo(const unsigned int xid,
-                                  struct cifs_tcon *tcon, char *file_name,
+                                  struct cifs_tcon *tcon, const char *file_name,
                                  const struct cifs_unix_set_info_args *args,
                                  const struct nls_table *nls_codepage,
-                                  int remap_special_chars);
+                                  int remap);
 extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
-                        const char *newName,
+                        const char *name, struct cifs_sb_info *cifs_sb);
-                        const struct nls_table *nls_codepage,
-                        int remap_special_chars);
 extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
-                        const char *name, const struct nls_table *nls_codepage,
+                        const char *name, struct cifs_sb_info *cifs_sb);
-                        int remap_special_chars);
 extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
                        const char *name, __u16 type,
                        const struct nls_table *nls_codepage,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index cabc7a01f5df..f0cf934ba877 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -948,15 +948,15 @@ DelFileRetry:
 }
 int
-CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon,
+CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
-             const char *dirName, const struct nls_table *nls_codepage,
+             struct cifs_sb_info *cifs_sb)
-             int remap)
 {
        DELETE_DIRECTORY_REQ *pSMB = NULL;
        DELETE_DIRECTORY_RSP *pSMBr = NULL;
        int rc = 0;
        int bytes_returned;
        int name_len;
+        int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
        cFYI(1, "In CIFSSMBRmDir");
 RmDirRetry:
@@ -966,14 +966,15 @@ RmDirRetry:
                return rc;
        if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-                name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName,
+                name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
-                                              PATH_MAX, nls_codepage, remap);
+                                              PATH_MAX, cifs_sb->local_nls,
+                                              remap);
                name_len++;     /* trailing null */
                name_len *= 2;
        } else {                /* BB improve check for buffer overruns BB */
-                name_len = strnlen(dirName, PATH_MAX);
+                name_len = strnlen(name, PATH_MAX);
                name_len++;     /* trailing null */
-                strncpy(pSMB->DirName, dirName, name_len);
+                strncpy(pSMB->DirName, name, name_len);
        }
        pSMB->BufferFormat = 0x04;
@@ -992,14 +993,15 @@ RmDirRetry:
 }
 int
-CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon,
+CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
-             const char *name, const struct nls_table *nls_codepage, int remap)
+             struct cifs_sb_info *cifs_sb)
 {
        int rc = 0;
        CREATE_DIRECTORY_REQ *pSMB = NULL;
        CREATE_DIRECTORY_RSP *pSMBr = NULL;
        int bytes_returned;
        int name_len;
+        int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
        cFYI(1, "In CIFSSMBMkDir");
 MkDirRetry:
@@ -1010,7 +1012,8 @@ MkDirRetry:
        if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
                name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
-                                              PATH_MAX, nls_codepage, remap);
+                                              PATH_MAX, cifs_sb->local_nls,
+                                              remap);
                name_len++;     /* trailing null */
                name_len *= 2;
        } else {                /* BB improve check for buffer overruns BB */
@@ -1573,9 +1576,14 @@ cifs_readv_callback(struct mid_q_entry *mid)
                /* result already set, check signature */
                if (server->sec_mode &
                    (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
-                        if (cifs_verify_signature(rdata->iov, rdata->nr_iov,
+                        int rc = 0;
-                                          server, mid->sequence_number + 1))
-                                cERROR(1, "Unexpected SMB signature");
+                        rc = cifs_verify_signature(rdata->iov, rdata->nr_iov,
+                                                   server,
+                                                   mid->sequence_number + 1);
+                        if (rc)
+                                cERROR(1, "SMB signature verification returned "
+                                       "error = %d", rc);
                }
                /* FIXME: should this be counted toward the initiating task? */
                task_io_account_read(rdata->bytes);
@@ -5943,7 +5951,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
 int
 CIFSSMBUnixSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
-                       char *fileName,
+                       const char *file_name,
                       const struct cifs_unix_set_info_args *args,
                       const struct nls_table *nls_codepage, int remap)
 {
@@ -5964,14 +5972,14 @@ setPermsRetry:
        if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
                name_len =
-                    cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+                    cifsConvertToUTF16((__le16 *) pSMB->FileName, file_name,
                                       PATH_MAX, nls_codepage, remap);
                name_len++;     /* trailing null */
                name_len *= 2;
        } else {        /* BB improve the check for buffer overruns BB */
-                name_len = strnlen(fileName, PATH_MAX);
+                name_len = strnlen(file_name, PATH_MAX);
                name_len++;     /* trailing null */
-                strncpy(pSMB->FileName, fileName, name_len);
+                strncpy(pSMB->FileName, file_name, name_len);
        }
        params = 6 + name_len;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index cbe709ad6663..781025be48bc 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -356,19 +356,12 @@ cifs_create_get_file_info:
 cifs_create_set_dentry:
        if (rc != 0) {
                cFYI(1, "Create worked, get_inode_info failed rc = %d", rc);
+                CIFSSMBClose(xid, tcon, *fileHandle);
                goto out;
        }
        d_drop(direntry);
        d_add(direntry, newinode);
-        /* ENOENT for create?  How weird... */
-        rc = -ENOENT;
-        if (!newinode) {
-                CIFSSMBClose(xid, tcon, *fileHandle);
-                goto out;
-        }
-        rc = 0;
 out:
        kfree(buf);
        kfree(full_path);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9154192b0683..71e9ad9f5961 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -917,7 +917,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
        if (!buf) {
                mutex_unlock(&cinode->lock_mutex);
                free_xid(xid);
-                return rc;
+                return -ENOMEM;
        }
        for (i = 0; i < 2; i++) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 35cb6a374a45..cb79c7edecb0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -124,10 +124,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 {
        struct cifsInodeInfo *cifs_i = CIFS_I(inode);
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-        unsigned long oldtime = cifs_i->time;
        cifs_revalidate_cache(inode, fattr);
+        spin_lock(&inode->i_lock);
        inode->i_atime = fattr->cf_atime;
        inode->i_mtime = fattr->cf_mtime;
        inode->i_ctime = fattr->cf_ctime;
@@ -148,9 +148,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
        else
                cifs_i->time = jiffies;
-        cFYI(1, "inode 0x%p old_time=%ld new_time=%ld", inode,
-                 oldtime, cifs_i->time);
        cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
        cifs_i->server_eof = fattr->cf_eof;
@@ -158,7 +155,6 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
         * Can't safely change the file size here if the client is writing to
         * it due to potential races.
         */
-        spin_lock(&inode->i_lock);
        if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
                i_size_write(inode, fattr->cf_eof);
@@ -859,12 +855,14 @@ struct inode *cifs_root_iget(struct super_block *sb)
        if (rc && tcon->ipc) {
                cFYI(1, "ipc connection - fake read inode");
+                spin_lock(&inode->i_lock);
                inode->i_mode |= S_IFDIR;
                set_nlink(inode, 2);
                inode->i_op = &cifs_ipc_inode_ops;
                inode->i_fop = &simple_dir_operations;
                inode->i_uid = cifs_sb->mnt_uid;
                inode->i_gid = cifs_sb->mnt_gid;
+                spin_unlock(&inode->i_lock);
        } else if (rc) {
                iget_failed(inode);
                inode = ERR_PTR(rc);
@@ -1110,6 +1108,15 @@ undo_setattr:
        goto out_close;
 }
+/*
+ * Decrement the link count of @inode while holding inode->i_lock, and only
+ * when i_nlink is still greater than zero.
+ *
+ * copied from fs/nfs/dir.c with small changes
+ */
+static void
+cifs_drop_nlink(struct inode *inode)
+{
+        spin_lock(&inode->i_lock);
+        if (inode->i_nlink > 0)
+                drop_nlink(inode);
+        spin_unlock(&inode->i_lock);
+}
 /*
 * If dentry->d_inode is null (usually meaning the cached dentry
@@ -1166,13 +1173,13 @@ retry_std_delete:
 psx_del_no_retry:
        if (!rc) {
                if (inode)
-                        drop_nlink(inode);
+                        cifs_drop_nlink(inode);
        } else if (rc == -ENOENT) {
                d_drop(dentry);
        } else if (rc == -ETXTBSY) {
                rc = cifs_rename_pending_delete(full_path, dentry, xid);
                if (rc == 0)
-                        drop_nlink(inode);
+                        cifs_drop_nlink(inode);
        } else if ((rc == -EACCES) && (dosattr == 0) && inode) {
                attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
                if (attrs == NULL) {
@@ -1219,16 +1226,154 @@ unlink_out:
        return rc;
 }
+/*
+ * Query the inode of a directory that has just been created on the server,
+ * attach it to @dentry and then apply mode/ownership for the new directory.
+ *
+ * @inode:     parent directory inode (its i_sb, S_ISGID bit and i_gid are used)
+ * @dentry:    dentry of the new directory
+ * @mode:      requested mode; the current umask is masked out here
+ * @full_path: path of the new directory, handed to the server calls
+ * @cifs_sb:   superblock info supplying the mount flags and local_nls
+ * @tcon:      tree connection the calls are issued on
+ * @xid:       transaction id handed through to the server calls
+ *
+ * Returns the result of the inode query. The results of the follow-up
+ * set-info calls are not checked and not reported to the caller.
+ */
+static int
+cifs_mkdir_qinfo(struct inode *inode, struct dentry *dentry, umode_t mode,
+                 const char *full_path, struct cifs_sb_info *cifs_sb,
+                 struct cifs_tcon *tcon, const unsigned int xid)
+{
+        int rc = 0;
+        struct inode *newinode = NULL;
+        if (tcon->unix_ext)
+                rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb,
+                                              xid);
+        else
+                rc = cifs_get_inode_info(&newinode, full_path, NULL,
+                                         inode->i_sb, xid, NULL);
+        if (rc)
+                return rc;
+        d_instantiate(dentry, newinode);
+        /*
+         * Setting nlink is not necessary except in cases where we failed to
+         * get it from the server or it was set to a bogus value. The inode
+         * pointer is tested before its i_lock is taken so that a NULL
+         * d_inode is never dereferenced.
+         */
+        if (dentry->d_inode) {
+                spin_lock(&dentry->d_inode->i_lock);
+                if (dentry->d_inode->i_nlink < 2)
+                        set_nlink(dentry->d_inode, 2);
+                spin_unlock(&dentry->d_inode->i_lock);
+        }
+        mode &= ~current_umask();
+        /* must turn on setgid bit if parent dir has it */
+        if (inode->i_mode & S_ISGID)
+                mode |= S_ISGID;
+        if (tcon->unix_ext) {
+                struct cifs_unix_set_info_args args = {
+                        .mode   = mode,
+                        .ctime  = NO_CHANGE_64,
+                        .atime  = NO_CHANGE_64,
+                        .mtime  = NO_CHANGE_64,
+                        .device = 0,
+                };
+                if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+                        args.uid = (__u64)current_fsuid();
+                        /* a setgid parent hands its group to the new dir */
+                        if (inode->i_mode & S_ISGID)
+                                args.gid = (__u64)inode->i_gid;
+                        else
+                                args.gid = (__u64)current_fsgid();
+                } else {
+                        args.uid = NO_CHANGE_64;
+                        args.gid = NO_CHANGE_64;
+                }
+                /* the return value of this call is not checked */
+                CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
+                                       cifs_sb->local_nls,
+                                       cifs_sb->mnt_cifs_flags &
+                                       CIFS_MOUNT_MAP_SPECIAL_CHR);
+        } else {
+                struct TCP_Server_Info *server = tcon->ses->server;
+                /*
+                 * When no write bit is requested and the mount does not use
+                 * CIFS ACLs, let the protocol-specific hook (if any) adjust
+                 * the attributes. The hook receives the inode, so skip it
+                 * when there is none.
+                 */
+                if (newinode &&
+                    !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
+                    (mode & S_IWUGO) == 0 && server->ops->mkdir_setinfo)
+                        server->ops->mkdir_setinfo(newinode, full_path, cifs_sb,
+                                                   tcon, xid);
+                if (dentry->d_inode) {
+                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+                                dentry->d_inode->i_mode = (mode | S_IFDIR);
+                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+                                dentry->d_inode->i_uid = current_fsuid();
+                                if (inode->i_mode & S_ISGID)
+                                        dentry->d_inode->i_gid = inode->i_gid;
+                                else
+                                        dentry->d_inode->i_gid =
+                                                                current_fsgid();
+                        }
+                }
+        }
+        return rc;
+}
+/*
+ * Create a directory with CIFSPOSIXCreate() and instantiate @dentry from
+ * the FILE_UNIX_BASIC_INFO that call fills in.
+ *
+ * If the call supplies no info (Type is -1) or cifs_iget() yields no inode,
+ * the inode is obtained through cifs_mkdir_qinfo() instead and its result
+ * is returned.
+ *
+ * Returns -ENOMEM when the info buffer cannot be allocated, otherwise the
+ * result of CIFSPOSIXCreate() or of cifs_mkdir_qinfo(). -EOPNOTSUPP from
+ * CIFSPOSIXCreate() is returned without dropping @dentry; any other error
+ * drops it.
+ */
+static int
+cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
+                 const char *full_path, struct cifs_sb_info *cifs_sb,
+                 struct cifs_tcon *tcon, const unsigned int xid)
+{
+        int rc;
+        u32 oplock = 0;
+        struct cifs_fattr fattr;
+        struct inode *newinode = NULL;
+        FILE_UNIX_BASIC_INFO *info = kzalloc(sizeof(*info), GFP_KERNEL);
+        if (!info)
+                return -ENOMEM;
+        mode &= ~current_umask();
+        rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode,
+                             NULL /* netfid */, info, &oplock, full_path,
+                             cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+                             CIFS_MOUNT_MAP_SPECIAL_CHR);
+        if (rc) {
+                /* -EOPNOTSUPP goes back untouched, without log or d_drop */
+                if (rc != -EOPNOTSUPP) {
+                        cFYI(1, "posix mkdir returned 0x%x", rc);
+                        d_drop(dentry);
+                }
+                goto free_info;
+        }
+        /*
+         * A Type of -1 means no info came back; newinode then stays NULL and
+         * the query path below is taken.
+         *
+         * BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if
+         * need to set uid/gid.
+         */
+        if (info->Type != cpu_to_le32(-1)) {
+                cifs_unix_basic_to_fattr(&fattr, info, cifs_sb);
+                cifs_fill_uniqueid(inode->i_sb, &fattr);
+                newinode = cifs_iget(inode->i_sb, &fattr);
+        }
+        if (!newinode) {
+                /* no usable inode from the create reply, go query for it */
+                rc = cifs_mkdir_qinfo(inode, dentry, mode, full_path, cifs_sb,
+                                      tcon, xid);
+                goto free_info;
+        }
+        d_instantiate(dentry, newinode);
+#ifdef CONFIG_CIFS_DEBUG2
+        cFYI(1, "instantiated dentry %p %s to inode %p", dentry,
+             dentry->d_name.name, newinode);
+        if (newinode->i_nlink != 2)
+                cFYI(1, "unexpected number of links %d", newinode->i_nlink);
+#endif
+free_info:
+        kfree(info);
+        return rc;
+}
 int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
 {
-        int rc = 0, tmprc;
+        int rc = 0;
        unsigned int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
        struct cifs_tcon *tcon;
-        char *full_path = NULL;
+        struct TCP_Server_Info *server;
-        struct inode *newinode = NULL;
+        char *full_path;
-        struct cifs_fattr fattr;
        cFYI(1, "In cifs_mkdir, mode = 0x%hx inode = 0x%p", mode, inode);
@@ -1248,145 +1393,29 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
        if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP &
                                le64_to_cpu(tcon->fsUnixInfo.Capability))) {
-                u32 oplock = 0;
+                rc = cifs_posix_mkdir(inode, direntry, mode, full_path, cifs_sb,
-                FILE_UNIX_BASIC_INFO *pInfo =
+                                      tcon, xid);
-                        kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+                if (rc != -EOPNOTSUPP)
-                if (pInfo == NULL) {
-                        rc = -ENOMEM;
                        goto mkdir_out;
-                }
+        }
-                mode &= ~current_umask();
-                rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT,
-                                mode, NULL /* netfid */, pInfo, &oplock,
-                                full_path, cifs_sb->local_nls,
-                                cifs_sb->mnt_cifs_flags &
-                                        CIFS_MOUNT_MAP_SPECIAL_CHR);
-                if (rc == -EOPNOTSUPP) {
-                        kfree(pInfo);
-                        goto mkdir_retry_old;
-                } else if (rc) {
-                        cFYI(1, "posix mkdir returned 0x%x", rc);
-                        d_drop(direntry);
-                } else {
-                        if (pInfo->Type == cpu_to_le32(-1)) {
-                                /* no return info, go query for it */
-                                kfree(pInfo);
-                                goto mkdir_get_info;
-                        }
-/*BB check (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID ) to see if need
-        to set uid/gid */
-                        cifs_unix_basic_to_fattr(&fattr, pInfo, cifs_sb);
-                        cifs_fill_uniqueid(inode->i_sb, &fattr);
-                        newinode = cifs_iget(inode->i_sb, &fattr);
-                        if (!newinode) {
-                                kfree(pInfo);
-                                goto mkdir_get_info;
-                        }
-                        d_instantiate(direntry, newinode);
-#ifdef CONFIG_CIFS_DEBUG2
+        server = tcon->ses->server;
-                        cFYI(1, "instantiated dentry %p %s to inode %p",
-                                direntry, direntry->d_name.name, newinode);
-                        if (newinode->i_nlink != 2)
+        if (!server->ops->mkdir) {
-                                cFYI(1, "unexpected number of links %d",
+                rc = -ENOSYS;
-                                        newinode->i_nlink);
-#endif
-                }
-                kfree(pInfo);
                goto mkdir_out;
        }
-mkdir_retry_old:
        /* BB add setting the equivalent of mode via CreateX w/ACLs */
-        rc = CIFSSMBMkDir(xid, tcon, full_path, cifs_sb->local_nls,
+        rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb);
-                          cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
        if (rc) {
                cFYI(1, "cifs_mkdir returned 0x%x", rc);
                d_drop(direntry);
-        } else {
+                goto mkdir_out;
-mkdir_get_info:
-                if (tcon->unix_ext)
-                        rc = cifs_get_inode_info_unix(&newinode, full_path,
-                                                      inode->i_sb, xid);
-                else
-                        rc = cifs_get_inode_info(&newinode, full_path, NULL,
-                                                 inode->i_sb, xid, NULL);
-                d_instantiate(direntry, newinode);
-                 /* setting nlink not necessary except in cases where we
-                  * failed to get it from the server or was set bogus */
-                if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
-                        set_nlink(direntry->d_inode, 2);
-                mode &= ~current_umask();
-                /* must turn on setgid bit if parent dir has it */
-                if (inode->i_mode & S_ISGID)
-                        mode |= S_ISGID;
-                if (tcon->unix_ext) {
-                        struct cifs_unix_set_info_args args = {
-                                .mode   = mode,
-                                .ctime  = NO_CHANGE_64,
-                                .atime  = NO_CHANGE_64,
-                                .mtime  = NO_CHANGE_64,
-                                .device = 0,
-                        };
-                        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-                                args.uid = (__u64)current_fsuid();
-                                if (inode->i_mode & S_ISGID)
-                                        args.gid = (__u64)inode->i_gid;
-                                else
-                                        args.gid = (__u64)current_fsgid();
-                        } else {
-                                args.uid = NO_CHANGE_64;
-                                args.gid = NO_CHANGE_64;
-                        }
-                        CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
-                                               cifs_sb->local_nls,
-                                               cifs_sb->mnt_cifs_flags &
-                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
-                } else {
-                        if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
-                            (mode & S_IWUGO) == 0) {
-                                FILE_BASIC_INFO pInfo;
-                                struct cifsInodeInfo *cifsInode;
-                                u32 dosattrs;
-                                memset(&pInfo, 0, sizeof(pInfo));
-                                cifsInode = CIFS_I(newinode);
-                                dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
-                                pInfo.Attributes = cpu_to_le32(dosattrs);
-                                tmprc = CIFSSMBSetPathInfo(xid, tcon,
-                                                full_path, &pInfo,
-                                                cifs_sb->local_nls,
-                                                cifs_sb->mnt_cifs_flags &
-                                                CIFS_MOUNT_MAP_SPECIAL_CHR);
-                                if (tmprc == 0)
-                                        cifsInode->cifsAttrs = dosattrs;
-                        }
-                        if (direntry->d_inode) {
-                                if (cifs_sb->mnt_cifs_flags &
-                                     CIFS_MOUNT_DYNPERM)
-                                        direntry->d_inode->i_mode =
-                                                (mode | S_IFDIR);
-                                if (cifs_sb->mnt_cifs_flags &
-                                     CIFS_MOUNT_SET_UID) {
-                                        direntry->d_inode->i_uid =
-                                                current_fsuid();
-                                        if (inode->i_mode & S_ISGID)
-                                                direntry->d_inode->i_gid =
-                                                        inode->i_gid;
-                                        else
-                                                direntry->d_inode->i_gid =
-                                                        current_fsgid();
-                                }
-                        }
-                }
        }
+        rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon,
+                              xid);
 mkdir_out:
        /*
         * Force revalidate to get parent dir info when needed since cached
@@ -1405,7 +1434,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
        unsigned int xid;
        struct cifs_sb_info *cifs_sb;
        struct tcon_link *tlink;
-        struct cifs_tcon *pTcon;
+        struct cifs_tcon *tcon;
+        struct TCP_Server_Info *server;
        char *full_path = NULL;
        struct cifsInodeInfo *cifsInode;
@@ -1425,10 +1455,16 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
                rc = PTR_ERR(tlink);
                goto rmdir_exit;
        }
-        pTcon = tlink_tcon(tlink);
+        tcon = tlink_tcon(tlink);
+        server = tcon->ses->server;
+        if (!server->ops->rmdir) {
+                rc = -ENOSYS;
+                cifs_put_tlink(tlink);
+                goto rmdir_exit;
+        }
-        rc = CIFSSMBRmDir(xid, pTcon, full_path, cifs_sb->local_nls,
+        rc = server->ops->rmdir(xid, tcon, full_path, cifs_sb);
-                          cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
        cifs_put_tlink(tlink);
        if (!rc) {
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 09e4b3ae4564..e6ce3b112875 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -433,7 +433,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
        if (old_file->d_inode) {
                cifsInode = CIFS_I(old_file->d_inode);
                if (rc == 0) {
+                        spin_lock(&old_file->d_inode->i_lock);
                        inc_nlink(old_file->d_inode);
+                        spin_unlock(&old_file->d_inode->i_lock);
 /* BB should we make this contingent on superblock flag NOATIME? */
 /*                      old_file->d_inode->i_ctime = CURRENT_TIME;*/
                        /* parent dir timestamps will update from srv
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index c40356d24c5c..3129ac74b819 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -586,6 +586,27 @@ cifs_print_stats(struct seq_file *m, struct cifs_tcon *tcon)
 #endif
 }
+/*
+ * Send the inode's cached attributes with ATTR_READONLY added to the server
+ * through CIFSSMBSetPathInfo() for @full_path. The cached cifsAttrs value
+ * is updated only when that call returns 0; a failure is otherwise ignored.
+ */
+static void
+cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
+                   struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
+                   const unsigned int xid)
+{
+        struct cifsInodeInfo *cinode = CIFS_I(inode);
+        const u32 attrs = cinode->cifsAttrs | ATTR_READONLY;
+        FILE_BASIC_INFO info;
+        /* everything except Attributes is sent as zero */
+        memset(&info, 0, sizeof(info));
+        info.Attributes = cpu_to_le32(attrs);
+        if (CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls,
+                               cifs_sb->mnt_cifs_flags &
+                                                CIFS_MOUNT_MAP_SPECIAL_CHR))
+                return;
+        cinode->cifsAttrs = attrs;
+}
 struct smb_version_operations smb1_operations = {
        .send_cancel = send_nt_cancel,
        .compare_fids = cifs_compare_fids,
@@ -620,6 +641,9 @@ struct smb_version_operations smb1_operations = {
        .get_srv_inum = cifs_get_srv_inum,
        .build_path_to_root = cifs_build_path_to_root,
        .echo = CIFSSMBEcho,
+        .mkdir = CIFSSMBMkDir,
+        .mkdir_setinfo = cifs_mkdir_setinfo,
+        .rmdir = CIFSSMBRmDir,
 };
 struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 1ba5c405315c..2aa5cb08c526 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -122,3 +122,42 @@ out:
        kfree(smb2_data);
        return rc;
 }
+/*
+ * Create the directory @name by running smb2_open_op_close() with the
+ * SMB2_OP_MKDIR command, passing FILE_WRITE_ATTRIBUTES, FILE_CREATE and
+ * CREATE_NOT_FILE and no data buffer. Returns that call's result.
+ */
+int
+smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+           struct cifs_sb_info *cifs_sb)
+{
+        int rc;
+        rc = smb2_open_op_close(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES,
+                                FILE_CREATE, 0, CREATE_NOT_FILE, NULL,
+                                SMB2_OP_MKDIR);
+        return rc;
+}
+/*
+ * Send the inode's cached attributes with ATTR_READONLY added to the server
+ * for @name through smb2_open_op_close() with SMB2_OP_SET_INFO. The cached
+ * cifsAttrs value is updated only when that call returns 0; a failure is
+ * otherwise ignored.
+ *
+ * NOTE(review): FILE_CREATE is passed here exactly as in smb2_mkdir();
+ * confirm it is the intended value for an already created directory.
+ */
+void
+smb2_mkdir_setinfo(struct inode *inode, const char *name,
+                   struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
+                   const unsigned int xid)
+{
+        struct cifsInodeInfo *cinode = CIFS_I(inode);
+        const u32 attrs = cinode->cifsAttrs | ATTR_READONLY;
+        FILE_BASIC_INFO data;
+        /* everything except Attributes is sent as zero */
+        memset(&data, 0, sizeof(data));
+        data.Attributes = cpu_to_le32(attrs);
+        if (smb2_open_op_close(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES,
+                               FILE_CREATE, 0, CREATE_NOT_FILE, &data,
+                               SMB2_OP_SET_INFO))
+                return;
+        cinode->cifsAttrs = attrs;
+}
+/*
+ * Remove the directory @name by running smb2_open_op_close() with the
+ * SMB2_OP_DELETE command, passing DELETE, FILE_OPEN and
+ * CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE and no data buffer. Returns that
+ * call's result.
+ */
+int
+smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
+           struct cifs_sb_info *cifs_sb)
+{
+        int rc;
+        rc = smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, 0,
+                                CREATE_NOT_FILE | CREATE_DELETE_ON_CLOSE, NULL,
+                                SMB2_OP_DELETE);
+        return rc;
+}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index a4ff5d547554..e4d3b9964167 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -52,7 +52,8 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
                        cERROR(1, "Bad protocol string signature header %x",
                                  *(unsigned int *) hdr->ProtocolId);
                if (mid != hdr->MessageId)
-                        cERROR(1, "Mids do not match");
+                        cERROR(1, "Mids do not match: %llu and %llu", mid,
+                                  hdr->MessageId);
        }
        cERROR(1, "Bad SMB detected. The Mid=%llu", hdr->MessageId);
        return 1;
@@ -107,7 +108,7 @@ smb2_check_message(char *buf, unsigned int length)
         * ie Validate the wct via smb2_struct_sizes table above
         */
-        if (length < 2 + sizeof(struct smb2_hdr)) {
+        if (length < sizeof(struct smb2_pdu)) {
                if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
                        pdu->StructureSize2 = 0;
                        /*
@@ -121,15 +122,15 @@ smb2_check_message(char *buf, unsigned int length)
                return 1;
        }
        if (len > CIFSMaxBufSize + MAX_SMB2_HDR_SIZE - 4) {
-                cERROR(1, "SMB length greater than maximum, mid=%lld", mid);
+                cERROR(1, "SMB length greater than maximum, mid=%llu", mid);
                return 1;
        }
        if (check_smb2_hdr(hdr, mid))
                return 1;
-        if (hdr->StructureSize != SMB2_HEADER_SIZE) {
+        if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
-                cERROR(1, "Illegal structure size %d",
+                cERROR(1, "Illegal structure size %u",
                          le16_to_cpu(hdr->StructureSize));
                return 1;
        }
@@ -161,8 +162,9 @@ smb2_check_message(char *buf, unsigned int length)
        if (4 + len != clc_len) {
                cFYI(1, "Calculated size %u length %u mismatch mid %llu",
                        clc_len, 4 + len, mid);
-                if (clc_len == 4 + len + 1) /* BB FIXME (fix samba) */
+                /* server can return one byte more */
-                        return 0; /* BB workaround Samba 3 bug SessSetup rsp */
+                if (clc_len == 4 + len + 1)
+                        return 0;
                return 1;
        }
        return 0;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 410cf925ea26..826209bf3684 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -318,6 +318,9 @@ struct smb_version_operations smb21_operations = {
        .query_path_info = smb2_query_path_info,
        .get_srv_inum = smb2_get_srv_inum,
        .build_path_to_root = smb2_build_path_to_root,
+        .mkdir = smb2_mkdir,
+        .mkdir_setinfo = smb2_mkdir_setinfo,
+        .rmdir = smb2_rmdir,
 };
 struct smb_version_values smb21_values = {
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f37a1b41b402..15dc8eea8273 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -87,10 +87,6 @@
 #define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe)
-#define SMB2_HEADER_SIZE __constant_le16_to_cpu(64)
-#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_le16_to_cpu(9)
 /*
 * SMB2 Header Definition
 *
@@ -99,6 +95,9 @@
 * "PDU" :  "Protocol Data Unit" (ie a network "frame")
 *
 */
+#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64)
 struct smb2_hdr {
        __be32 smb2_buf_length; /* big endian on wire */
                                /* length is only two or three bytes - with
@@ -140,6 +139,9 @@ struct smb2_pdu {
 *  command code name for the struct. Note that structures must be packed.
 *
 */
+#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9)
 struct smb2_err_rsp {
        struct smb2_hdr hdr;
        __le16 StructureSize;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 902bbe2b5ad3..bfaa7b148afd 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -52,6 +52,14 @@ extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
                                struct cifs_sb_info *cifs_sb,
                                const char *full_path, FILE_ALL_INFO *data,
                                bool *adjust_tz);
+extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon,
+                      const char *name, struct cifs_sb_info *cifs_sb);
+extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
+                               struct cifs_sb_info *cifs_sb,
+                               struct cifs_tcon *tcon, const unsigned int xid);
+extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
+                      const char *name, struct cifs_sb_info *cifs_sb);
 /*
 * SMB2 Worker functions - most of protocol specific implementation details
 * are contained within these calls.
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 83867ef348df..d9b639b95fa8 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -503,13 +503,16 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
        /* convert the length into a more usable form */
        if (server->sec_mode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
                struct kvec iov;
+                int rc = 0;
                iov.iov_base = mid->resp_buf;
                iov.iov_len = len;
                /* FIXME: add code to kill session */
-                if (cifs_verify_signature(&iov, 1, server,
+                rc = cifs_verify_signature(&iov, 1, server,
-                                          mid->sequence_number + 1) != 0)
+                                           mid->sequence_number + 1);
-                        cERROR(1, "Unexpected SMB signature");
+                if (rc)
+                        cERROR(1, "SMB signature verification returned error = "
+                               "%d", rc);
        }
        /* BB special case reconnect tid and uid here? */
diff --git a/fs/compat.c b/fs/compat.c
index 6161255fac45..1bdb350ea5d3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1155,11 +1155,14 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
        struct file *file;
        int fput_needed;
        ssize_t ret;
+        loff_t pos;
        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;
-        ret = compat_readv(file, vec, vlen, &file->f_pos);
+        pos = file->f_pos;
+        ret = compat_readv(file, vec, vlen, &pos);
+        file->f_pos = pos;
        fput_light(file, fput_needed);
        return ret;
 }
@@ -1221,11 +1224,14 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
        struct file *file;
        int fput_needed;
        ssize_t ret;
+        loff_t pos;
        file = fget_light(fd, &fput_needed);
        if (!file)
                return -EBADF;
-        ret = compat_writev(file, vec, vlen, &file->f_pos);
+        pos = file->f_pos;
+        ret = compat_writev(file, vec, vlen, &pos);
+        file->f_pos = pos;
        fput_light(file, fput_needed);
        return ret;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1faf4cb56f39..f86c720dba0e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1062,6 +1062,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        unsigned long user_addr;
        size_t bytes;
        struct buffer_head map_bh = { 0, };
+        struct blk_plug plug;
        if (rw & WRITE)
                rw = WRITE_ODIRECT;
@@ -1177,6 +1178,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                                PAGE_SIZE - user_addr / PAGE_SIZE);
        }
+        blk_start_plug(&plug);
        for (seg = 0; seg < nr_segs; seg++) {
                user_addr = (unsigned long)iov[seg].iov_base;
                sdio.size += bytes = iov[seg].iov_len;
@@ -1235,6 +1238,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        if (sdio.bio)
                dio_bio_submit(dio, &sdio);
+        blk_finish_plug(&plug);
        /*
         * It is possible that, we return short IO due to end of file.
         * In that case, we need to release all the pages we got hold on.
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 989e034f02bd..cfb4b9fed520 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -385,8 +385,6 @@ struct ecryptfs_msg_ctx {
        struct mutex mux;
 };
-struct ecryptfs_daemon;
 struct ecryptfs_daemon {
 #define ECRYPTFS_DAEMON_IN_READ      0x00000001
 #define ECRYPTFS_DAEMON_IN_POLL      0x00000002
@@ -394,10 +392,7 @@ struct ecryptfs_daemon {
 #define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008
        u32 flags;
        u32 num_queued_msg_ctx;
-        struct pid *pid;
+        struct file *file;
-        uid_t euid;
-        struct user_namespace *user_ns;
-        struct task_struct *task;
        struct mutex mux;
        struct list_head msg_ctx_out_queue;
        wait_queue_head_t wait;
@@ -554,6 +549,8 @@ extern struct kmem_cache *ecryptfs_key_tfm_cache;
 struct inode *ecryptfs_get_inode(struct inode *lower_inode,
                                 struct super_block *sb);
 void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
+int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
+                             struct inode *ecryptfs_inode);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
                                         size_t *decrypted_name_size,
                                         struct dentry *ecryptfs_dentry,
@@ -607,13 +604,8 @@ int
 ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                  size_t size, int flags);
 int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode);
-int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns,
+int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
-                          struct pid *pid);
+                              struct ecryptfs_message *msg, u32 seq);
-int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
-                          struct pid *pid);
-int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
-                              struct user_namespace *user_ns, struct pid *pid,
-                              u32 seq);
 int ecryptfs_send_message(char *data, int data_len,
                          struct ecryptfs_msg_ctx **msg_ctx);
 int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx,
@@ -658,8 +650,7 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
                                     struct inode *ecryptfs_inode);
 struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
 int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
-int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
+int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon);
-                                 struct user_namespace *user_ns);
 int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
                                 size_t *length_size);
 int ecryptfs_write_packet_length(char *dest, size_t size,
@@ -671,8 +662,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
                          u16 msg_flags, struct ecryptfs_daemon *daemon);
 void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
 int
-ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
+ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file);
-                      struct user_namespace *user_ns, struct pid *pid);
 int ecryptfs_init_kthread(void);
 void ecryptfs_destroy_kthread(void);
 int ecryptfs_privileged_open(struct file **lower_file,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2b17f2f9b121..44ce5c6a541d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -138,29 +138,50 @@ out:
        return rc;
 }
-static void ecryptfs_vma_close(struct vm_area_struct *vma)
+struct kmem_cache *ecryptfs_file_info_cache;
-{
-        filemap_write_and_wait(vma->vm_file->f_mapping);
-}
-static const struct vm_operations_struct ecryptfs_file_vm_ops = {
-        .close          = ecryptfs_vma_close,
-        .fault          = filemap_fault,
-};
-static int ecryptfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int read_or_initialize_metadata(struct dentry *dentry)
 {
+        struct inode *inode = dentry->d_inode;
+        struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+        struct ecryptfs_crypt_stat *crypt_stat;
        int rc;
-        rc = generic_file_mmap(file, vma);
+        crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
+        mount_crypt_stat = &ecryptfs_superblock_to_private(
+                                                inode->i_sb)->mount_crypt_stat;
+        mutex_lock(&crypt_stat->cs_mutex);
+        if (crypt_stat->flags & ECRYPTFS_POLICY_APPLIED &&
+            crypt_stat->flags & ECRYPTFS_KEY_VALID) {
+                rc = 0;
+                goto out;
+        }
+        rc = ecryptfs_read_metadata(dentry);
        if (!rc)
-                vma->vm_ops = &ecryptfs_file_vm_ops;
+                goto out;
+        if (mount_crypt_stat->flags & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED) {
+                crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
+                                       | ECRYPTFS_ENCRYPTED);
+                rc = 0;
+                goto out;
+        }
+        if (!(mount_crypt_stat->flags & ECRYPTFS_XATTR_METADATA_ENABLED) &&
+            !i_size_read(ecryptfs_inode_to_lower(inode))) {
+                rc = ecryptfs_initialize_file(dentry, inode);
+                if (!rc)
+                        goto out;
+        }
+        rc = -EIO;
+out:
+        mutex_unlock(&crypt_stat->cs_mutex);
        return rc;
 }
-struct kmem_cache *ecryptfs_file_info_cache;
 /**
 * ecryptfs_open
 * @inode: inode speciying file to open
@@ -236,32 +257,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
                rc = 0;
                goto out;
        }
-        mutex_lock(&crypt_stat->cs_mutex);
+        rc = read_or_initialize_metadata(ecryptfs_dentry);
-        if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)
+        if (rc)
-            || !(crypt_stat->flags & ECRYPTFS_KEY_VALID)) {
+                goto out_put;
-                rc = ecryptfs_read_metadata(ecryptfs_dentry);
-                if (rc) {
-                        ecryptfs_printk(KERN_DEBUG,
-                                        "Valid headers not found\n");
-                        if (!(mount_crypt_stat->flags
-                              & ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED)) {
-                                rc = -EIO;
-                                printk(KERN_WARNING "Either the lower file "
-                                       "is not in a valid eCryptfs format, "
-                                       "or the key could not be retrieved. "
-                                       "Plaintext passthrough mode is not "
-                                       "enabled; returning -EIO\n");
-                                mutex_unlock(&crypt_stat->cs_mutex);
-                                goto out_put;
-                        }
-                        rc = 0;
-                        crypt_stat->flags &= ~(ECRYPTFS_I_SIZE_INITIALIZED
-                                               | ECRYPTFS_ENCRYPTED);
-                        mutex_unlock(&crypt_stat->cs_mutex);
-                        goto out;
-                }
-        }
-        mutex_unlock(&crypt_stat->cs_mutex);
        ecryptfs_printk(KERN_DEBUG, "inode w/ addr = [0x%p], i_ino = "
                        "[0x%.16lx] size: [0x%.16llx]\n", inode, inode->i_ino,
                        (unsigned long long)i_size_read(inode));
@@ -292,15 +290,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 static int
 ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 {
-        int rc = 0;
+        return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
-        rc = generic_file_fsync(file, start, end, datasync);
-        if (rc)
-                goto out;
-        rc = vfs_fsync_range(ecryptfs_file_to_lower(file), start, end,
-                             datasync);
-out:
-        return rc;
 }
 static int ecryptfs_fasync(int fd, struct file *file, int flag)
@@ -369,7 +359,7 @@ const struct file_operations ecryptfs_main_fops = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl = ecryptfs_compat_ioctl,
 #endif
-        .mmap = ecryptfs_file_mmap,
+        .mmap = generic_file_mmap,
        .open = ecryptfs_open,
        .flush = ecryptfs_flush,
        .release = ecryptfs_release,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index ffa2be57804d..534b129ea676 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -143,6 +143,31 @@ static int ecryptfs_interpose(struct dentry *lower_dentry,
        return 0;
 }
+static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
+                              struct inode *inode)
+{
+        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+        struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
+        struct dentry *lower_dir_dentry;
+        int rc;
+        dget(lower_dentry);
+        lower_dir_dentry = lock_parent(lower_dentry);
+        rc = vfs_unlink(lower_dir_inode, lower_dentry);
+        if (rc) {
+                printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
+                goto out_unlock;
+        }
+        fsstack_copy_attr_times(dir, lower_dir_inode);
+        set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
+        inode->i_ctime = dir->i_ctime;
+        d_drop(dentry);
+out_unlock:
+        unlock_dir(lower_dir_dentry);
+        dput(lower_dentry);
+        return rc;
+}
 /**
 * ecryptfs_do_create
 * @directory_inode: inode of the new file's dentry's parent in ecryptfs
@@ -182,8 +207,10 @@ ecryptfs_do_create(struct inode *directory_inode,
        }
        inode = __ecryptfs_get_inode(lower_dentry->d_inode,
                                     directory_inode->i_sb);
-        if (IS_ERR(inode))
+        if (IS_ERR(inode)) {
+                vfs_unlink(lower_dir_dentry->d_inode, lower_dentry);
                goto out_lock;
+        }
        fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
        fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
 out_lock:
@@ -200,8 +227,8 @@ out:
 *
 * Returns zero on success
 */
-static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
+int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
-                                    struct inode *ecryptfs_inode)
+                             struct inode *ecryptfs_inode)
 {
        struct ecryptfs_crypt_stat *crypt_stat =
                &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -264,7 +291,9 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
         * that this on disk file is prepared to be an ecryptfs file */
        rc = ecryptfs_initialize_file(ecryptfs_dentry, ecryptfs_inode);
        if (rc) {
-                drop_nlink(ecryptfs_inode);
+                ecryptfs_do_unlink(directory_inode, ecryptfs_dentry,
+                                   ecryptfs_inode);
+                make_bad_inode(ecryptfs_inode);
                unlock_new_inode(ecryptfs_inode);
                iput(ecryptfs_inode);
                goto out;
@@ -318,21 +347,20 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry,
        struct vfsmount *lower_mnt;
        int rc = 0;
-        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
-        fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
-        BUG_ON(!lower_dentry->d_count);
        dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
-        ecryptfs_set_dentry_private(dentry, dentry_info);
        if (!dentry_info) {
                printk(KERN_ERR "%s: Out of memory whilst attempting "
                       "to allocate ecryptfs_dentry_info struct\n",
                        __func__);
                dput(lower_dentry);
-                mntput(lower_mnt);
-                d_drop(dentry);
                return -ENOMEM;
        }
+        lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
+        fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
+        BUG_ON(!lower_dentry->d_count);
+        ecryptfs_set_dentry_private(dentry, dentry_info);
        ecryptfs_set_dentry_lower(dentry, lower_dentry);
        ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
@@ -381,12 +409,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        struct dentry *lower_dir_dentry, *lower_dentry;
        int rc = 0;
-        if ((ecryptfs_dentry->d_name.len == 1
-             && !strcmp(ecryptfs_dentry->d_name.name, "."))
-            || (ecryptfs_dentry->d_name.len == 2
-                && !strcmp(ecryptfs_dentry->d_name.name, ".."))) {
-                goto out_d_drop;
-        }
        lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
        mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
        lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
@@ -397,8 +419,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                rc = PTR_ERR(lower_dentry);
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
                                "[%d] on lower_dentry = [%s]\n", __func__, rc,
-                                encrypted_and_encoded_name);
+                                ecryptfs_dentry->d_name.name);
-                goto out_d_drop;
+                goto out;
        }
        if (lower_dentry->d_inode)
                goto interpose;
@@ -415,7 +437,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
        if (rc) {
                printk(KERN_ERR "%s: Error attempting to encrypt and encode "
                       "filename; rc = [%d]\n", __func__, rc);
-                goto out_d_drop;
+                goto out;
        }
        mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
        lower_dentry = lookup_one_len(encrypted_and_encoded_name,
@@ -427,14 +449,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
                ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
                                "[%d] on lower_dentry = [%s]\n", __func__, rc,
                                encrypted_and_encoded_name);
-                goto out_d_drop;
+                goto out;
        }
 interpose:
        rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
                                       ecryptfs_dir_inode);
-        goto out;
-out_d_drop:
-        d_drop(ecryptfs_dentry);
 out:
        kfree(encrypted_and_encoded_name);
        return ERR_PTR(rc);
@@ -476,27 +495,7 @@ out_lock:
 static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-        int rc = 0;
+        return ecryptfs_do_unlink(dir, dentry, dentry->d_inode);
-        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-        struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
-        struct dentry *lower_dir_dentry;
-        dget(lower_dentry);
-        lower_dir_dentry = lock_parent(lower_dentry);
-        rc = vfs_unlink(lower_dir_inode, lower_dentry);
-        if (rc) {
-                printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
-                goto out_unlock;
-        }
-        fsstack_copy_attr_times(dir, lower_dir_inode);
-        set_nlink(dentry->d_inode,
-                  ecryptfs_inode_to_lower(dentry->d_inode)->i_nlink);
-        dentry->d_inode->i_ctime = dir->i_ctime;
-        d_drop(dentry);
-out_unlock:
-        unlock_dir(lower_dir_dentry);
-        dput(lower_dentry);
-        return rc;
 }
 static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -971,12 +970,6 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
                        goto out;
        }
-        if (S_ISREG(inode->i_mode)) {
-                rc = filemap_write_and_wait(inode->i_mapping);
-                if (rc)
-                        goto out;
-                fsstack_copy_attr_all(inode, lower_inode);
-        }
        memcpy(&lower_ia, ia, sizeof(lower_ia));
        if (ia->ia_valid & ATTR_FILE)
                lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file);
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1c0b3b6b75c6..2768138eefee 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -279,6 +279,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
        char *fnek_src;
        char *cipher_key_bytes_src;
        char *fn_cipher_key_bytes_src;
+        u8 cipher_code;
        *check_ruid = 0;
@@ -420,6 +421,18 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
            && !fn_cipher_key_bytes_set)
                mount_crypt_stat->global_default_fn_cipher_key_bytes =
                        mount_crypt_stat->global_default_cipher_key_size;
+        cipher_code = ecryptfs_code_for_cipher_string(
+                mount_crypt_stat->global_default_cipher_name,
+                mount_crypt_stat->global_default_cipher_key_size);
+        if (!cipher_code) {
+                ecryptfs_printk(KERN_ERR,
+                                "eCryptfs doesn't support cipher: %s",
+                                mount_crypt_stat->global_default_cipher_name);
+                rc = -EINVAL;
+                goto out;
+        }
        mutex_lock(&key_tfm_list_mutex);
        if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name,
                                 NULL)) {
@@ -540,6 +553,15 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
        }
        ecryptfs_set_superblock_lower(s, path.dentry->d_sb);
+        /*
+         * Set the POSIX ACL flag based on whether ACLs are enabled in the
+         * lower mount. Force a read-only eCryptfs mount if the lower mount
+         * is ro. Allow a ro eCryptfs mount even when the lower mount is rw.
+         */
+        s->s_flags = flags & ~MS_POSIXACL;
+        s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL);
        s->s_maxbytes = path.dentry->d_sb->s_maxbytes;
        s->s_blocksize = path.dentry->d_sb->s_blocksize;
        s->s_magic = ECRYPTFS_SUPER_MAGIC;
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index a750f957b145..b29bb8bfa8d9 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -32,8 +32,8 @@ static struct mutex ecryptfs_msg_ctx_lists_mux;
 static struct hlist_head *ecryptfs_daemon_hash;
 struct mutex ecryptfs_daemon_hash_mux;
 static int ecryptfs_hash_bits;
-#define ecryptfs_uid_hash(uid) \
+#define ecryptfs_current_euid_hash(uid) \
-        hash_long((unsigned long)uid, ecryptfs_hash_bits)
+                hash_long((unsigned long)current_euid(), ecryptfs_hash_bits)
 static u32 ecryptfs_msg_counter;
 static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr;
@@ -105,26 +105,24 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx)
 /**
 * ecryptfs_find_daemon_by_euid
- * @euid: The effective user id which maps to the desired daemon id
- * @user_ns: The namespace in which @euid applies
 * @daemon: If return value is zero, points to the desired daemon pointer
 *
 * Must be called with ecryptfs_daemon_hash_mux held.
 *
- * Search the hash list for the given user id.
+ * Search the hash list for the current effective user id.
 *
 * Returns zero if the user id exists in the list; non-zero otherwise.
 */
-int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
+int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon)
-                                 struct user_namespace *user_ns)
 {
        struct hlist_node *elem;
        int rc;
        hlist_for_each_entry(*daemon, elem,
-                             &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)],
+                            &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()],
-                             euid_chain) {
+                            euid_chain) {
-                if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) {
+                if ((*daemon)->file->f_cred->euid == current_euid() &&
+                    (*daemon)->file->f_cred->user_ns == current_user_ns()) {
                        rc = 0;
                        goto out;
                }
@@ -137,9 +135,7 @@ out:
 /**
 * ecryptfs_spawn_daemon - Create and initialize a new daemon struct
 * @daemon: Pointer to set to newly allocated daemon struct
- * @euid: Effective user id for the daemon
+ * @file: File used when opening /dev/ecryptfs
- * @user_ns: The namespace in which @euid applies
- * @pid: Process id for the daemon
 *
 * Must be called ceremoniously while in possession of
 * ecryptfs_sacred_daemon_hash_mux
@@ -147,8 +143,7 @@ out:
 * Returns zero on success; non-zero otherwise
 */
 int
-ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
+ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file)
-                      struct user_namespace *user_ns, struct pid *pid)
 {
        int rc = 0;
@@ -159,16 +154,13 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
                       "GFP_KERNEL memory\n", __func__, sizeof(**daemon));
                goto out;
        }
-        (*daemon)->euid = euid;
+        (*daemon)->file = file;
-        (*daemon)->user_ns = get_user_ns(user_ns);
-        (*daemon)->pid = get_pid(pid);
-        (*daemon)->task = current;
        mutex_init(&(*daemon)->mux);
        INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue);
        init_waitqueue_head(&(*daemon)->wait);
        (*daemon)->num_queued_msg_ctx = 0;
        hlist_add_head(&(*daemon)->euid_chain,
-                       &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)]);
+                       &ecryptfs_daemon_hash[ecryptfs_current_euid_hash()]);
 out:
        return rc;
 }
@@ -188,9 +180,6 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
        if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ)
            || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) {
                rc = -EBUSY;
-                printk(KERN_WARNING "%s: Attempt to destroy daemon with pid "
-                       "[0x%p], but it is in the midst of a read or a poll\n",
-                       __func__, daemon->pid);
                mutex_unlock(&daemon->mux);
                goto out;
        }
@@ -203,12 +192,6 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
                ecryptfs_msg_ctx_alloc_to_free(msg_ctx);
        }
        hlist_del(&daemon->euid_chain);
-        if (daemon->task)
-                wake_up_process(daemon->task);
-        if (daemon->pid)
-                put_pid(daemon->pid);
-        if (daemon->user_ns)
-                put_user_ns(daemon->user_ns);
        mutex_unlock(&daemon->mux);
        kzfree(daemon);
 out:
@@ -216,42 +199,9 @@ out:
 }
 /**
- * ecryptfs_process_quit
- * @euid: The user ID owner of the message
- * @user_ns: The namespace in which @euid applies
- * @pid: The process ID for the userspace program that sent the
- *       message
- *
- * Deletes the corresponding daemon for the given euid and pid, if
- * it is the registered that is requesting the deletion. Returns zero
- * after deleting the desired daemon; non-zero otherwise.
- */
-int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns,
-                          struct pid *pid)
-{
-        struct ecryptfs_daemon *daemon;
-        int rc;
-        mutex_lock(&ecryptfs_daemon_hash_mux);
-        rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns);
-        if (rc || !daemon) {
-                rc = -EINVAL;
-                printk(KERN_ERR "Received request from user [%d] to "
-                       "unregister unrecognized daemon [0x%p]\n", euid, pid);
-                goto out_unlock;
-        }
-        rc = ecryptfs_exorcise_daemon(daemon);
-out_unlock:
-        mutex_unlock(&ecryptfs_daemon_hash_mux);
-        return rc;
-}
-/**
 * ecryptfs_process_reponse
 * @msg: The ecryptfs message received; the caller should sanity check
 *       msg->data_len and free the memory
- * @pid: The process ID of the userspace application that sent the
- *       message
 * @seq: The sequence number of the message; must match the sequence
 *       number for the existing message context waiting for this
 *       response
@@ -270,16 +220,11 @@ out_unlock:
 *
 * Returns zero on success; non-zero otherwise
 */
-int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
+int ecryptfs_process_response(struct ecryptfs_daemon *daemon,
-                              struct user_namespace *user_ns, struct pid *pid,
+                              struct ecryptfs_message *msg, u32 seq)
-                              u32 seq)
 {
-        struct ecryptfs_daemon *uninitialized_var(daemon);
        struct ecryptfs_msg_ctx *msg_ctx;
        size_t msg_size;
-        struct nsproxy *nsproxy;
-        struct user_namespace *tsk_user_ns;
-        uid_t ctx_euid;
        int rc;
        if (msg->index >= ecryptfs_message_buf_len) {
@@ -292,51 +237,6 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
        }
        msg_ctx = &ecryptfs_msg_ctx_arr[msg->index];
        mutex_lock(&msg_ctx->mux);
-        mutex_lock(&ecryptfs_daemon_hash_mux);
-        rcu_read_lock();
-        nsproxy = task_nsproxy(msg_ctx->task);
-        if (nsproxy == NULL) {
-                rc = -EBADMSG;
-                printk(KERN_ERR "%s: Receiving process is a zombie. Dropping "
-                       "message.\n", __func__);
-                rcu_read_unlock();
-                mutex_unlock(&ecryptfs_daemon_hash_mux);
-                goto wake_up;
-        }
-        tsk_user_ns = __task_cred(msg_ctx->task)->user_ns;
-        ctx_euid = task_euid(msg_ctx->task);
-        rc = ecryptfs_find_daemon_by_euid(&daemon, ctx_euid, tsk_user_ns);
-        rcu_read_unlock();
-        mutex_unlock(&ecryptfs_daemon_hash_mux);
-        if (rc) {
-                rc = -EBADMSG;
-                printk(KERN_WARNING "%s: User [%d] received a "
-                       "message response from process [0x%p] but does "
-                       "not have a registered daemon\n", __func__,
-                       ctx_euid, pid);
-                goto wake_up;
-        }
-        if (ctx_euid != euid) {
-                rc = -EBADMSG;
-                printk(KERN_WARNING "%s: Received message from user "
-                       "[%d]; expected message from user [%d]\n", __func__,
-                       euid, ctx_euid);
-                goto unlock;
-        }
-        if (tsk_user_ns != user_ns) {
-                rc = -EBADMSG;
-                printk(KERN_WARNING "%s: Received message from user_ns "
-                       "[0x%p]; expected message from user_ns [0x%p]\n",
-                       __func__, user_ns, tsk_user_ns);
-                goto unlock;
-        }
-        if (daemon->pid != pid) {
-                rc = -EBADMSG;
-                printk(KERN_ERR "%s: User [%d] sent a message response "
-                       "from an unrecognized process [0x%p]\n",
-                       __func__, ctx_euid, pid);
-                goto unlock;
-        }
        if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) {
                rc = -EINVAL;
                printk(KERN_WARNING "%s: Desired context element is not "
@@ -359,9 +259,8 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid,
        }
        memcpy(msg_ctx->msg, msg, msg_size);
        msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE;
-        rc = 0;
-wake_up:
        wake_up_process(msg_ctx->task);
+        rc = 0;
 unlock:
        mutex_unlock(&msg_ctx->mux);
 out:
@@ -383,14 +282,11 @@ ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type,
                             struct ecryptfs_msg_ctx **msg_ctx)
 {
        struct ecryptfs_daemon *daemon;
-        uid_t euid = current_euid();
        int rc;
-        rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
+        rc = ecryptfs_find_daemon_by_euid(&daemon);
        if (rc || !daemon) {
                rc = -ENOTCONN;
-                printk(KERN_ERR "%s: User [%d] does not have a daemon "
-                       "registered\n", __func__, euid);
                goto out;
        }
        mutex_lock(&ecryptfs_msg_ctx_lists_mux);
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index c0038f6566d4..412e6eda25f8 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -33,7 +33,7 @@ static atomic_t ecryptfs_num_miscdev_opens;
 /**
 * ecryptfs_miscdev_poll
- * @file: dev file (ignored)
+ * @file: dev file
 * @pt: dev poll table (ignored)
 *
 * Returns the poll mask
@@ -41,20 +41,10 @@ static atomic_t ecryptfs_num_miscdev_opens;
 static unsigned int
 ecryptfs_miscdev_poll(struct file *file, poll_table *pt)
 {
-        struct ecryptfs_daemon *daemon;
+        struct ecryptfs_daemon *daemon = file->private_data;
        unsigned int mask = 0;
-        uid_t euid = current_euid();
-        int rc;
-        mutex_lock(&ecryptfs_daemon_hash_mux);
-        /* TODO: Just use file->private_data? */
-        rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
-        if (rc || !daemon) {
-                mutex_unlock(&ecryptfs_daemon_hash_mux);
-                return -EINVAL;
-        }
        mutex_lock(&daemon->mux);
-        mutex_unlock(&ecryptfs_daemon_hash_mux);
        if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
                printk(KERN_WARNING "%s: Attempt to poll on zombified "
                       "daemon\n", __func__);
@@ -79,7 +69,7 @@ out_unlock_daemon:
 /**
 * ecryptfs_miscdev_open
 * @inode: inode of miscdev handle (ignored)
- * @file: file for miscdev handle (ignored)
+ * @file: file for miscdev handle
 *
 * Returns zero on success; non-zero otherwise
 */
@@ -87,7 +77,6 @@ static int
 ecryptfs_miscdev_open(struct inode *inode, struct file *file)
 {
        struct ecryptfs_daemon *daemon = NULL;
-        uid_t euid = current_euid();
        int rc;
        mutex_lock(&ecryptfs_daemon_hash_mux);
@@ -98,30 +87,20 @@ ecryptfs_miscdev_open(struct inode *inode, struct file *file)
                       "count; rc = [%d]\n", __func__, rc);
                goto out_unlock_daemon_list;
        }
-        rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
+        rc = ecryptfs_find_daemon_by_euid(&daemon);
-        if (rc || !daemon) {
+        if (!rc) {
-                rc = ecryptfs_spawn_daemon(&daemon, euid, current_user_ns(),
-                                           task_pid(current));
-                if (rc) {
-                        printk(KERN_ERR "%s: Error attempting to spawn daemon; "
-                               "rc = [%d]\n", __func__, rc);
-                        goto out_module_put_unlock_daemon_list;
-                }
-        }
-        mutex_lock(&daemon->mux);
-        if (daemon->pid != task_pid(current)) {
                rc = -EINVAL;
-                printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], "
+                goto out_unlock_daemon_list;
-                       "but pid [0x%p] has attempted to open the handle "
+        }
-                       "instead\n", __func__, daemon->pid, daemon->euid,
+        rc = ecryptfs_spawn_daemon(&daemon, file);
-                       task_pid(current));
+        if (rc) {
-                goto out_unlock_daemon;
+                printk(KERN_ERR "%s: Error attempting to spawn daemon; "
+                       "rc = [%d]\n", __func__, rc);
+                goto out_module_put_unlock_daemon_list;
        }
+        mutex_lock(&daemon->mux);
        if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) {
                rc = -EBUSY;
-                printk(KERN_ERR "%s: Miscellaneous device handle may only be "
-                       "opened once per daemon; pid [0x%p] already has this "
-                       "handle open\n", __func__, daemon->pid);
                goto out_unlock_daemon;
        }
        daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN;
@@ -140,7 +119,7 @@ out_unlock_daemon_list:
 /**
 * ecryptfs_miscdev_release
 * @inode: inode of fs/ecryptfs/euid handle (ignored)
- * @file: file for fs/ecryptfs/euid handle (ignored)
+ * @file: file for fs/ecryptfs/euid handle
 *
 * This keeps the daemon registered until the daemon sends another
 * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters.
@@ -150,20 +129,18 @@ out_unlock_daemon_list:
 static int
 ecryptfs_miscdev_release(struct inode *inode, struct file *file)
 {
-        struct ecryptfs_daemon *daemon = NULL;
+        struct ecryptfs_daemon *daemon = file->private_data;
-        uid_t euid = current_euid();
        int rc;
-        mutex_lock(&ecryptfs_daemon_hash_mux);
-        rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
-        if (rc || !daemon)
-                daemon = file->private_data;
        mutex_lock(&daemon->mux);
        BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN));
        daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN;
        atomic_dec(&ecryptfs_num_miscdev_opens);
        mutex_unlock(&daemon->mux);
+        mutex_lock(&ecryptfs_daemon_hash_mux);
        rc = ecryptfs_exorcise_daemon(daemon);
+        mutex_unlock(&ecryptfs_daemon_hash_mux);
        if (rc) {
                printk(KERN_CRIT "%s: Fatal error whilst attempting to "
                       "shut down daemon; rc = [%d]. Please report this "
@@ -171,7 +148,6 @@ ecryptfs_miscdev_release(struct inode *inode, struct file *file)
                BUG();
        }
        module_put(THIS_MODULE);
-        mutex_unlock(&ecryptfs_daemon_hash_mux);
        return rc;
 }
@@ -248,7 +224,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size,
 /**
 * ecryptfs_miscdev_read - format and send message from queue
- * @file: fs/ecryptfs/euid miscdevfs handle (ignored)
+ * @file: miscdevfs handle
 * @buf: User buffer into which to copy the next message on the daemon queue
 * @count: Amount of space available in @buf
 * @ppos: Offset in file (ignored)
@@ -262,43 +238,27 @@ static ssize_t
 ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
                      loff_t *ppos)
 {
-        struct ecryptfs_daemon *daemon;
+        struct ecryptfs_daemon *daemon = file->private_data;
        struct ecryptfs_msg_ctx *msg_ctx;
        size_t packet_length_size;
        char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
        size_t i;
        size_t total_length;
-        uid_t euid = current_euid();
        int rc;
-        mutex_lock(&ecryptfs_daemon_hash_mux);
-        /* TODO: Just use file->private_data? */
-        rc = ecryptfs_find_daemon_by_euid(&daemon, euid, current_user_ns());
-        if (rc || !daemon) {
-                mutex_unlock(&ecryptfs_daemon_hash_mux);
-                return -EINVAL;
-        }
        mutex_lock(&daemon->mux);
-        if (task_pid(current) != daemon->pid) {
-                mutex_unlock(&daemon->mux);
-                mutex_unlock(&ecryptfs_daemon_hash_mux);
-                return -EPERM;
-        }
        if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
                rc = 0;
-                mutex_unlock(&ecryptfs_daemon_hash_mux);
                printk(KERN_WARNING "%s: Attempt to read from zombified "
                       "daemon\n", __func__);
                goto out_unlock_daemon;
        }
        if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) {
                rc = 0;
-                mutex_unlock(&ecryptfs_daemon_hash_mux);
                goto out_unlock_daemon;
        }
        /* This daemon will not go away so long as this flag is set */
        daemon->flags |= ECRYPTFS_DAEMON_IN_READ;
-        mutex_unlock(&ecryptfs_daemon_hash_mux);
 check_list:
        if (list_empty(&daemon->msg_ctx_out_queue)) {
                mutex_unlock(&daemon->mux);
@@ -382,16 +342,12 @@ out_unlock_daemon:
 * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
 * @data: Bytes comprising struct ecryptfs_message
 * @data_size: sizeof(struct ecryptfs_message) + data len
- * @euid: Effective user id of miscdevess sending the miscdev response
- * @user_ns: The namespace in which @euid applies
- * @pid: Miscdevess id of miscdevess sending the miscdev response
 * @seq: Sequence number for miscdev response packet
 *
 * Returns zero on success; non-zero otherwise
 */
-static int ecryptfs_miscdev_response(char *data, size_t data_size,
+static int ecryptfs_miscdev_response(struct ecryptfs_daemon *daemon, char *data,
-                                     uid_t euid, struct user_namespace *user_ns,
+                                     size_t data_size, u32 seq)
-                                     struct pid *pid, u32 seq)
 {
        struct ecryptfs_message *msg = (struct ecryptfs_message *)data;
        int rc;
@@ -403,7 +359,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size,
                rc = -EINVAL;
                goto out;
        }
-        rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq);
+        rc = ecryptfs_process_response(daemon, msg, seq);
        if (rc)
                printk(KERN_ERR
                       "Error processing response message; rc = [%d]\n", rc);
@@ -413,7 +369,7 @@ out:
 /**
 * ecryptfs_miscdev_write - handle write to daemon miscdev handle
- * @file: File for misc dev handle (ignored)
+ * @file: File for misc dev handle
 * @buf: Buffer containing user data
 * @count: Amount of data in @buf
 * @ppos: Pointer to offset in file (ignored)
@@ -428,7 +384,6 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
        u32 seq;
        size_t packet_size, packet_size_length;
        char *data;
-        uid_t euid = current_euid();
        unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
        ssize_t rc;
@@ -488,10 +443,9 @@ memdup:
                }
                memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
                seq = be32_to_cpu(counter_nbo);
-                rc = ecryptfs_miscdev_response(
+                rc = ecryptfs_miscdev_response(file->private_data,
                                &data[PKT_LEN_OFFSET + packet_size_length],
-                                packet_size, euid, current_user_ns(),
+                                packet_size, seq);
-                                task_pid(current), seq);
                if (rc) {
                        printk(KERN_WARNING "%s: Failed to deliver miscdev "
                               "response to requesting operation; rc = [%zd]\n",
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index a46b3a8fee1e..bd1d57f98f74 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -66,18 +66,6 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
 {
        int rc;
-        /*
-         * Refuse to write the page out if we are called from reclaim context
-         * since our writepage() path may potentially allocate memory when
-         * calling into the lower fs vfs_write() which may in turn invoke
-         * us again.
-         */
-        if (current->flags & PF_MEMALLOC) {
-                redirty_page_for_writepage(wbc, page);
-                rc = 0;
-                goto out;
-        }
        rc = ecryptfs_encrypt_page(page);
        if (rc) {
                ecryptfs_printk(KERN_WARNING, "Error encrypting "
@@ -498,7 +486,6 @@ static int ecryptfs_write_end(struct file *file,
        struct ecryptfs_crypt_stat *crypt_stat =
                &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
        int rc;
-        int need_unlock_page = 1;
        ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
                        "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
@@ -519,26 +506,26 @@ static int ecryptfs_write_end(struct file *file,
                        "zeros in page with index = [0x%.16lx]\n", index);
                goto out;
        }
-        set_page_dirty(page);
+        rc = ecryptfs_encrypt_page(page);
-        unlock_page(page);
+        if (rc) {
-        need_unlock_page = 0;
+                ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
+                                "index [0x%.16lx])\n", index);
+                goto out;
+        }
        if (pos + copied > i_size_read(ecryptfs_inode)) {
                i_size_write(ecryptfs_inode, pos + copied);
                ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
                        "[0x%.16llx]\n",
                        (unsigned long long)i_size_read(ecryptfs_inode));
-                balance_dirty_pages_ratelimited(mapping);
-                rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
-                if (rc) {
-                        printk(KERN_ERR "Error writing inode size to metadata; "
-                               "rc = [%d]\n", rc);
-                        goto out;
-                }
        }
-        rc = copied;
+        rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
+        if (rc)
+                printk(KERN_ERR "Error writing inode size to metadata; "
+                       "rc = [%d]\n", rc);
+        else
+                rc = copied;
 out:
-        if (need_unlock_page)
+        unlock_page(page);
-                unlock_page(page);
        page_cache_release(page);
        return rc;
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1c8b55670804..eedec84c1809 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1654,8 +1654,8 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
                error = PTR_ERR(file);
                goto out_free_fd;
        }
-        fd_install(fd, file);
        ep->file = file;
+        fd_install(fd, file);
        return fd;
 out_free_fd:
diff --git a/fs/exec.c b/fs/exec.c
index 3684353ebd5f..574cf4de4ec3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2069,25 +2069,18 @@ static void wait_for_dump_helpers(struct file *file)
 */
 static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 {
-        struct file *rp, *wp;
+        struct file *files[2];
        struct fdtable *fdt;
        struct coredump_params *cp = (struct coredump_params *)info->data;
        struct files_struct *cf = current->files;
+        int err = create_pipe_files(files, 0);
+        if (err)
+                return err;
-        wp = create_write_pipe(0);
+        cp->file = files[1];
-        if (IS_ERR(wp))
-                return PTR_ERR(wp);
-        rp = create_read_pipe(wp, 0);
-        if (IS_ERR(rp)) {
-                free_write_pipe(wp);
-                return PTR_ERR(rp);
-        }
-        cp->file = wp;
        sys_close(0);
-        fd_install(0, rp);
+        fd_install(0, files[0]);
        spin_lock(&cf->file_lock);
        fdt = files_fdtable(cf);
        __set_open_fd(0, fdt);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 5badb0c039de..1562c27a2fab 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,12 @@
 #define EXOFS_DBGMSG2(M...) do {} while (0)
-enum {MAX_PAGES_KMALLOC = PAGE_SIZE / sizeof(struct page *), };
 unsigned exofs_max_io_pages(struct ore_layout *layout,
                            unsigned expected_pages)
 {
-        unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
+        unsigned pages = min_t(unsigned, expected_pages,
+                               layout->max_io_length / PAGE_SIZE);
-        /* TODO: easily support bio chaining */
-        pages =  min_t(unsigned, pages, layout->max_io_length / PAGE_SIZE);
        return pages;
 }
@@ -101,7 +98,8 @@ static void _pcol_reset(struct page_collect *pcol)
         * it might not end here. don't be left with nothing
         */
        if (!pcol->expected_pages)
-                pcol->expected_pages = MAX_PAGES_KMALLOC;
+                pcol->expected_pages =
+                                exofs_max_io_pages(&pcol->sbi->layout, ~0);
 }
 static int pcol_try_alloc(struct page_collect *pcol)
@@ -389,6 +387,8 @@ static int readpage_strip(void *data, struct page *page)
        size_t len;
        int ret;
+        BUG_ON(!PageLocked(page));
        /* FIXME: Just for debugging, will be removed */
        if (PageUptodate(page))
                EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
@@ -572,8 +572,16 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
        if (!pcol->that_locked_page ||
            (pcol->that_locked_page->index != index)) {
-                struct page *page = find_get_page(pcol->inode->i_mapping, index);
+                struct page *page;
+                loff_t i_size = i_size_read(pcol->inode);
+                if (offset >= i_size) {
+                        *uptodate = true;
+                        EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index);
+                        return ZERO_PAGE(0);
+                }
+                page =  find_get_page(pcol->inode->i_mapping, index);
                if (!page) {
                        page = find_or_create_page(pcol->inode->i_mapping,
                                                   index, GFP_NOFS);
@@ -602,12 +610,13 @@ static void __r4w_put_page(void *priv, struct page *page)
 {
        struct page_collect *pcol = priv;
-        if (pcol->that_locked_page != page) {
+        if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
                EXOFS_DBGMSG("index=0x%lx\n", page->index);
                page_cache_release(page);
                return;
        }
-        EXOFS_DBGMSG("that_locked_page index=0x%lx\n", page->index);
+        EXOFS_DBGMSG("that_locked_page index=0x%lx\n",
+                     ZERO_PAGE(0) == page ? -1 : page->index);
 }
 static const struct _ore_r4w_op _r4w_op = {
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index 24a49d47e935..1585db1aa365 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -837,11 +837,11 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
                                bio->bi_rw |= REQ_WRITE;
                        }
-                        osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
+                        osd_req_write(or, _ios_obj(ios, cur_comp),
-                                      bio, per_dev->length);
+                                      per_dev->offset, bio, per_dev->length);
                        ORE_DBGMSG("write(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
-                                     _LLU(_ios_obj(ios, dev)->id),
+                                     _LLU(_ios_obj(ios, cur_comp)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(per_dev->length), dev);
                } else if (ios->kern_buff) {
@@ -853,20 +853,20 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp)
                               (ios->si.unit_off + ios->length >
                                ios->layout->stripe_unit));
-                        ret = osd_req_write_kern(or, _ios_obj(ios, per_dev->dev),
+                        ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp),
                                                 per_dev->offset,
                                                 ios->kern_buff, ios->length);
                        if (unlikely(ret))
                                goto out;
                        ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
                                      "length=0x%llx dev=%d\n",
-                                     _LLU(_ios_obj(ios, dev)->id),
+                                     _LLU(_ios_obj(ios, cur_comp)->id),
                                     _LLU(per_dev->offset),
                                     _LLU(ios->length), per_dev->dev);
                } else {
-                        osd_req_set_attributes(or, _ios_obj(ios, dev));
+                        osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
                        ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
-                                     _LLU(_ios_obj(ios, dev)->id),
+                                     _LLU(_ios_obj(ios, cur_comp)->id),
                                     ios->out_attr_len, dev);
                }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 433783624d10..dde41a75c7c8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -400,8 +400,6 @@ static int exofs_sync_fs(struct super_block *sb, int wait)
        ret = ore_write(ios);
        if (unlikely(ret))
                EXOFS_ERR("%s: ore_write failed.\n", __func__);
-        else
-                sb->s_dirt = 0;
        unlock_super(sb);
@@ -412,14 +410,6 @@ out:
        return ret;
 }
-static void exofs_write_super(struct super_block *sb)
-{
-        if (!(sb->s_flags & MS_RDONLY))
-                exofs_sync_fs(sb, 1);
-        else
-                sb->s_dirt = 0;
-}
 static void _exofs_print_device(const char *msg, const char *dev_path,
                                struct osd_dev *od, u64 pid)
 {
@@ -952,7 +942,6 @@ static const struct super_operations exofs_sops = {
        .write_inode    = exofs_write_inode,
        .evict_inode    = exofs_evict_inode,
        .put_super      = exofs_put_super,
-        .write_super    = exofs_write_super,
        .sync_fs        = exofs_sync_fs,
        .statfs         = exofs_statfs,
 };
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 264d315f6c47..6363ac66fafa 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -79,6 +79,7 @@ void ext2_evict_inode(struct inode * inode)
        truncate_inode_pages(&inode->i_data, 0);
        if (want_delete) {
+                sb_start_intwrite(inode->i_sb);
                /* set dtime */
                EXT2_I(inode)->i_dtime  = get_seconds();
                mark_inode_dirty(inode);
@@ -98,8 +99,10 @@ void ext2_evict_inode(struct inode * inode)
        if (unlikely(rsv))
                kfree(rsv);
-        if (want_delete)
+        if (want_delete) {
                ext2_free_inode(inode);
+                sb_end_intwrite(inode->i_sb);
+        }
 }
 typedef struct {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9f311d27b16f..af74d9e27b71 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,8 @@ static void ext2_sync_super(struct super_block *sb,
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
+static int ext2_freeze(struct super_block *sb);
+static int ext2_unfreeze(struct super_block *sb);
 void ext2_error(struct super_block *sb, const char *function,
                const char *fmt, ...)
@@ -305,6 +307,8 @@ static const struct super_operations ext2_sops = {
        .evict_inode    = ext2_evict_inode,
        .put_super      = ext2_put_super,
        .sync_fs        = ext2_sync_fs,
+        .freeze_fs      = ext2_freeze,
+        .unfreeze_fs    = ext2_unfreeze,
        .statfs         = ext2_statfs,
        .remount_fs     = ext2_remount,
        .show_options   = ext2_show_options,
@@ -1200,6 +1204,35 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
        return 0;
 }
+static int ext2_freeze(struct super_block *sb)
+{
+        struct ext2_sb_info *sbi = EXT2_SB(sb);
+        /*
+         * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared
+         * because we have unattached inodes and thus filesystem is not fully
+         * consistent.
+         */
+        if (atomic_long_read(&sb->s_remove_count)) {
+                ext2_sync_fs(sb, 1);
+                return 0;
+        }
+        /* Set EXT2_VALID_FS flag */
+        spin_lock(&sbi->s_lock);
+        sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state);
+        spin_unlock(&sbi->s_lock);
+        ext2_sync_super(sb, sbi->s_es, 1);
+        return 0;
+}
+static int ext2_unfreeze(struct super_block *sb)
+{
+        /* Just write sb to clear EXT2_VALID_FS flag */
+        ext2_write_super(sb);
+        return 0;
+}
 void ext2_write_super(struct super_block *sb)
 {
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 9a4a5c48b1c9..ff574b4e345e 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3072,6 +3072,8 @@ static int ext3_do_update_inode(handle_t *handle,
        struct ext3_inode_info *ei = EXT3_I(inode);
        struct buffer_head *bh = iloc->bh;
        int err = 0, rc, block;
+        int need_datasync = 0;
+        __le32 disksize;
        uid_t i_uid;
        gid_t i_gid;
@@ -3113,7 +3115,11 @@ again:
                raw_inode->i_gid_high = 0;
        }
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
-        raw_inode->i_size = cpu_to_le32(ei->i_disksize);
+        disksize = cpu_to_le32(ei->i_disksize);
+        if (disksize != raw_inode->i_size) {
+                need_datasync = 1;
+                raw_inode->i_size = disksize;
+        }
        raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
        raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
        raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
@@ -3129,8 +3135,11 @@ again:
        if (!S_ISREG(inode->i_mode)) {
                raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
        } else {
-                raw_inode->i_size_high =
+                disksize = cpu_to_le32(ei->i_disksize >> 32);
-                        cpu_to_le32(ei->i_disksize >> 32);
+                if (disksize != raw_inode->i_size_high) {
+                        raw_inode->i_size_high = disksize;
+                        need_datasync = 1;
+                }
                if (ei->i_disksize > 0x7fffffffULL) {
                        struct super_block *sb = inode->i_sb;
                        if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
@@ -3183,6 +3192,8 @@ again:
        ext3_clear_inode_state(inode, EXT3_STATE_NEW);
        atomic_set(&ei->i_sync_tid, handle->h_transaction->t_tid);
+        if (need_datasync)
+                atomic_set(&ei->i_datasync_tid, handle->h_transaction->t_tid);
 out_brelse:
        brelse (bh);
        ext3_std_error(inode->i_sb, err);
@@ -3459,14 +3470,6 @@ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
- *
- * Is this efficient/effective?  Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O.  But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out.  One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory.  It has the desired
- * effect.
 */
 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index ff9bcdc5b0d5..8c892e93d8e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -64,11 +64,6 @@ static int ext3_freeze(struct super_block *sb);
 /*
 * Wrappers for journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
 */
 handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
 {
@@ -90,12 +85,6 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks)
        return journal_start(journal, nblocks);
 }
-/*
- * The only special thing we need to do here is to make sure that all
- * journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
 int __ext3_journal_stop(const char *where, handle_t *handle)
 {
        struct super_block *sb;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d23b31ca9d7a..1b5089067d01 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -280,14 +280,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
        return desc;
 }
-static int ext4_valid_block_bitmap(struct super_block *sb,
+/*
-                                   struct ext4_group_desc *desc,
+ * Return the block number which was discovered to be invalid, or 0 if
-                                   unsigned int block_group,
+ * the block bitmap is valid.
-                                   struct buffer_head *bh)
+ */
+static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
+                                            struct ext4_group_desc *desc,
+                                            unsigned int block_group,
+                                            struct buffer_head *bh)
 {
        ext4_grpblk_t offset;
        ext4_grpblk_t next_zero_bit;
-        ext4_fsblk_t bitmap_blk;
+        ext4_fsblk_t blk;
        ext4_fsblk_t group_first_block;
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
@@ -297,37 +301,33 @@ static int ext4_valid_block_bitmap(struct super_block *sb,
                 * or it has to also read the block group where the bitmaps
                 * are located to verify they are set.
                 */
-                return 1;
+                return 0;
        }
        group_first_block = ext4_group_first_block_no(sb, block_group);
        /* check whether block bitmap block number is set */
-        bitmap_blk = ext4_block_bitmap(sb, desc);
+        blk = ext4_block_bitmap(sb, desc);
-        offset = bitmap_blk - group_first_block;
+        offset = blk - group_first_block;
        if (!ext4_test_bit(offset, bh->b_data))
                /* bad block bitmap */
-                goto err_out;
+                return blk;
        /* check whether the inode bitmap block number is set */
-        bitmap_blk = ext4_inode_bitmap(sb, desc);
+        blk = ext4_inode_bitmap(sb, desc);
-        offset = bitmap_blk - group_first_block;
+        offset = blk - group_first_block;
        if (!ext4_test_bit(offset, bh->b_data))
                /* bad block bitmap */
-                goto err_out;
+                return blk;
        /* check whether the inode table block number is set */
-        bitmap_blk = ext4_inode_table(sb, desc);
+        blk = ext4_inode_table(sb, desc);
-        offset = bitmap_blk - group_first_block;
+        offset = blk - group_first_block;
        next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
                                offset + EXT4_SB(sb)->s_itb_per_group,
                                offset);
-        if (next_zero_bit >= offset + EXT4_SB(sb)->s_itb_per_group)
+        if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
-                /* good bitmap for inode tables */
+                /* bad bitmap for inode tables */
-                return 1;
+                return blk;
-err_out:
-        ext4_error(sb, "Invalid block bitmap - block_group = %d, block = %llu",
-                        block_group, bitmap_blk);
        return 0;
 }
@@ -336,14 +336,26 @@ void ext4_validate_block_bitmap(struct super_block *sb,
                               unsigned int block_group,
                               struct buffer_head *bh)
 {
+        ext4_fsblk_t    blk;
        if (buffer_verified(bh))
                return;
        ext4_lock_group(sb, block_group);
-        if (ext4_valid_block_bitmap(sb, desc, block_group, bh) &&
+        blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
-            ext4_block_bitmap_csum_verify(sb, block_group, desc, bh,
+        if (unlikely(blk != 0)) {
-                                          EXT4_BLOCKS_PER_GROUP(sb) / 8))
+                ext4_unlock_group(sb, block_group);
-                set_buffer_verified(bh);
+                ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+                           block_group, blk);
+                return;
+        }
+        if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+                        desc, bh, EXT4_BLOCKS_PER_GROUP(sb) / 8))) {
+                ext4_unlock_group(sb, block_group);
+                ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+                return;
+        }
+        set_buffer_verified(bh);
        ext4_unlock_group(sb, block_group);
 }
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index f8716eab9995..5c2d1813ebe9 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -79,7 +79,6 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
        if (provided == calculated)
                return 1;
-        ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group);
        return 0;
 }
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cd0c7ed06772..aabbb3f53683 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2662,6 +2662,7 @@ cont:
                }
                path[0].p_depth = depth;
                path[0].p_hdr = ext_inode_hdr(inode);
+                i = 0;
                if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
                        err = -EIO;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 89b59cb7f9b8..dff171c3a123 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -233,6 +233,11 @@ void ext4_evict_inode(struct inode *inode)
        if (is_bad_inode(inode))
                goto no_delete;
+        /*
+         * Protect us against freezing - the caller of iput() is not required
+         * to hold any protection against it.
+         */
+        sb_start_intwrite(inode->i_sb);
        handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
@@ -242,6 +247,7 @@ void ext4_evict_inode(struct inode *inode)
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
+                sb_end_intwrite(inode->i_sb);
                goto no_delete;
        }
@@ -273,6 +279,7 @@ void ext4_evict_inode(struct inode *inode)
                stop_handle:
                        ext4_journal_stop(handle);
                        ext4_orphan_del(NULL, inode);
+                        sb_end_intwrite(inode->i_sb);
                        goto no_delete;
                }
        }
@@ -301,6 +308,7 @@ void ext4_evict_inode(struct inode *inode)
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
+        sb_end_intwrite(inode->i_sb);
        return;
 no_delete:
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
@@ -1962,7 +1970,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
 * This function can get called via...
 *   - ext4_da_writepages after taking page lock (have journal handle)
 *   - journal_submit_inode_data_buffers (no journal handle)
- *   - shrink_page_list via pdflush (no journal handle)
+ *   - shrink_page_list via kswapd/direct reclaim (no journal handle)
 *   - grab_page_cache when doing write_begin (have journal handle)
 *
 * We don't do any block allocation in this function. If we have page with
@@ -4581,14 +4589,6 @@ static int ext4_expand_extra_isize(struct inode *inode,
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
- *
- * Is this efficient/effective?  Well, we're being nice to the system
- * by cleaning up our inodes proactively so they can be reaped
- * without I/O.  But we are potentially leaving up to five seconds'
- * worth of inodes floating about which prune_icache wants us to
- * write out.  One way to fix that would be to get prune_icache()
- * to do a write_super() to free up some memory.  It has the desired
- * effect.
 */
 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
 {
@@ -4779,11 +4779,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        get_block_t *get_block;
        int retries = 0;
-        /*
+        sb_start_pagefault(inode->i_sb);
-         * This check is racy but catches the common case. We rely on
-         * __block_page_mkwrite() to do a reliable check.
-         */
-        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
        /* Delalloc case is easy... */
        if (test_opt(inode->i_sb, DELALLOC) &&
            !ext4_should_journal_data(inode) &&
@@ -4851,5 +4847,6 @@ retry_alloc:
 out_ret:
        ret = block_page_mkwrite_return(ret);
 out:
+        sb_end_pagefault(inode->i_sb);
        return ret;
 }
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index f99a1311e847..fe7c63f4717e 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -44,6 +44,11 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
 {
        struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+        /*
+         * We protect against freezing so that we don't create dirty buffers
+         * on a frozen filesystem.
+         */
+        sb_start_write(sb);
        ext4_mmp_csum_set(sb, mmp);
        mark_buffer_dirty(bh);
        lock_buffer(bh);
@@ -51,6 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
        get_bh(bh);
        submit_bh(WRITE_SYNC, bh);
        wait_on_buffer(bh);
+        sb_end_write(sb);
        if (unlikely(!buffer_uptodate(bh)))
                return 1;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2d51cd9af225..c6e0cb3d1f4a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -326,38 +326,17 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
 * Wrappers for jbd2_journal_start/end.
- *
- * The only special thing we need to do here is to make sure that all
- * journal_end calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- *
- * To avoid j_barrier hold in userspace when a user calls freeze(),
- * ext4 prevents a new handle from being started by s_frozen, which
- * is in an upper layer.
 */
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 {
        journal_t *journal;
-        handle_t  *handle;
        trace_ext4_journal_start(sb, nblocks, _RET_IP_);
        if (sb->s_flags & MS_RDONLY)
                return ERR_PTR(-EROFS);
+        WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
        journal = EXT4_SB(sb)->s_journal;
-        handle = ext4_journal_current_handle();
-        /*
-         * If a handle has been started, it should be allowed to
-         * finish, otherwise deadlock could happen between freeze
-         * and others(e.g. truncate) due to the restart of the
-         * journal handle if the filesystem is forzen and active
-         * handles are not stopped.
-         */
-        if (!handle)
-                vfs_check_frozen(sb, SB_FREEZE_TRANS);
        if (!journal)
                return ext4_get_nojournal();
        /*
@@ -372,12 +351,6 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
        return jbd2_journal_start(journal, nblocks);
 }
-/*
- * The only special thing we need to do here is to make sure that all
- * jbd2_journal_stop calls result in the superblock being marked dirty, so
- * that sync() will call the filesystem's write_super callback if
- * appropriate.
- */
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 {
        struct super_block *sb;
@@ -975,6 +948,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_reserved_meta_blocks = 0;
        ei->i_allocated_meta_blocks = 0;
        ei->i_da_metadata_calc_len = 0;
+        ei->i_da_metadata_calc_last_lblock = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
 #ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
@@ -2747,6 +2721,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
        sb = elr->lr_super;
        ngroups = EXT4_SB(sb)->s_groups_count;
+        sb_start_write(sb);
        for (group = elr->lr_next_group; group < ngroups; group++) {
                gdp = ext4_get_group_desc(sb, group, NULL);
                if (!gdp) {
@@ -2773,6 +2748,7 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
                elr->lr_next_sched = jiffies + elr->lr_timeout;
                elr->lr_next_group = group + 1;
        }
+        sb_end_write(sb);
        return ret;
 }
@@ -3133,6 +3109,10 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
        ext4_group_t            i, ngroups = ext4_get_groups_count(sb);
        int                     s, j, count = 0;
+        if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+                return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+                        sbi->s_itb_per_group + 2);
        first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
                (grp * EXT4_BLOCKS_PER_GROUP(sb));
        last_block = first_block + EXT4_BLOCKS_PER_GROUP(sb) - 1;
@@ -4444,6 +4424,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
                ext4_commit_super(sb, 1);
                jbd2_journal_clear_err(journal);
+                jbd2_journal_update_sb_errno(journal);
        }
 }
@@ -4460,10 +4441,8 @@ int ext4_force_commit(struct super_block *sb)
                return 0;
        journal = EXT4_SB(sb)->s_journal;
-        if (journal) {
+        if (journal)
-                vfs_check_frozen(sb, SB_FREEZE_TRANS);
                ret = ext4_journal_force_commit(journal);
-        }
        return ret;
 }
@@ -4493,9 +4472,8 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 * gives us a chance to flush the journal completely and mark the fs clean.
 *
 * Note that only this function cannot bring a filesystem to be in a clean
- * state independently, because ext4 prevents a new handle from being started
+ * state independently. It relies on the upper layer to stop all data & metadata
- * by @sb->s_frozen, which stays in an upper layer.  It thus needs help from
+ * modifications.
- * the upper layer.
 */
 static int ext4_freeze(struct super_block *sb)
 {
@@ -4522,7 +4500,7 @@ static int ext4_freeze(struct super_block *sb)
        EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
        error = ext4_commit_super(sb, 1);
 out:
-        /* we rely on s_frozen to stop further updates */
+        /* we rely on the upper layer to stop further updates */
        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
        return error;
 }
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a71fe3715ee8..e007b8bd8e5e 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -43,10 +43,10 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
        if (err)
                goto out;
-        mutex_lock(&inode->i_mutex);
        err = mnt_want_write_file(file);
        if (err)
-                goto out_unlock_inode;
+                goto out;
+        mutex_lock(&inode->i_mutex);
        /*
         * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -73,14 +73,14 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
        /* The root directory has no attributes */
        if (inode->i_ino == MSDOS_ROOT_INO && attr != ATTR_DIR) {
                err = -EINVAL;
-                goto out_drop_write;
+                goto out_unlock_inode;
        }
        if (sbi->options.sys_immutable &&
            ((attr | oldattr) & ATTR_SYS) &&
            !capable(CAP_LINUX_IMMUTABLE)) {
                err = -EPERM;
-                goto out_drop_write;
+                goto out_unlock_inode;
        }
        /*
@@ -90,12 +90,12 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
         */
        err = security_inode_setattr(file->f_path.dentry, &ia);
        if (err)
-                goto out_drop_write;
+                goto out_unlock_inode;
        /* This MUST be done before doing anything irreversible... */
        err = fat_setattr(file->f_path.dentry, &ia);
        if (err)
-                goto out_drop_write;
+                goto out_unlock_inode;
        fsnotify_change(file->f_path.dentry, ia.ia_valid);
        if (sbi->options.sys_immutable) {
@@ -107,10 +107,9 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
        fat_save_attrs(inode, attr);
        mark_inode_dirty(inode);
-out_drop_write:
-        mnt_drop_write_file(file);
 out_unlock_inode:
        mutex_unlock(&inode->i_mutex);
+        mnt_drop_write_file(file);
 out:
        return err;
 }
diff --git a/fs/file_table.c b/fs/file_table.c
index b3fc4d67a26b..701985e4ccda 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -43,7 +43,7 @@ static struct kmem_cache *filp_cachep __read_mostly;
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
-static inline void file_free_rcu(struct rcu_head *head)
+static void file_free_rcu(struct rcu_head *head)
 {
        struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
@@ -217,7 +217,7 @@ static void drop_file_write_access(struct file *file)
                return;
        if (file_check_writeable(file) != 0)
                return;
-        mnt_drop_write(mnt);
+        __mnt_drop_write(mnt);
        file_release_write(file);
 }
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 03ff5b1eba93..75a20c092dd4 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -117,7 +117,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
                                              const char __user *buf,
                                              size_t count, loff_t *ppos)
 {
-        unsigned val;
+        unsigned uninitialized_var(val);
        ssize_t ret;
        ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -154,7 +154,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
                                                    const char __user *buf,
                                                    size_t count, loff_t *ppos)
 {
-        unsigned val;
+        unsigned uninitialized_var(val);
        ssize_t ret;
        ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3426521f3205..ee8d55042298 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -396,7 +396,7 @@ err_device:
 err_region:
        unregister_chrdev_region(devt, 1);
 err:
-        fc->conn_error = 1;
+        fuse_conn_kill(fc);
        goto out;
 }
@@ -532,8 +532,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file)
                cdev_del(cc->cdev);
        }
-        /* kill connection and shutdown channel */
-        fuse_conn_kill(&cc->fc);
        rc = fuse_dev_release(inode, file);     /* puts the base reference */
        return rc;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 7df2b5e8fbe1..f4246cfc8d87 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1576,6 +1576,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
                req->pages[req->num_pages] = page;
                req->num_pages++;
+                offset = 0;
                num -= this_num;
                total_len += this_num;
                index++;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 8964cf3999b2..324bc0850534 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -383,6 +383,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
        struct fuse_entry_out outentry;
        struct fuse_file *ff;
+        /* Userspace expects S_IFREG in create mode */
+        BUG_ON((mode & S_IFMT) != S_IFREG);
        forget = fuse_alloc_forget();
        err = -ENOMEM;
        if (!forget)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index b321a688cde7..aba15f1b7ad2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -703,13 +703,16 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                                  unsigned long nr_segs, loff_t pos)
 {
        struct inode *inode = iocb->ki_filp->f_mapping->host;
+        struct fuse_conn *fc = get_fuse_conn(inode);
-        if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) {
+        /*
+         * In auto invalidate mode, always update attributes on read.
+         * Otherwise, only update if we attempt to read past EOF (to ensure
+         * i_size is up to date).
+         */
+        if (fc->auto_inval_data ||
+            (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
                int err;
-                /*
-                 * If trying to read past EOF, make sure the i_size
-                 * attribute is up-to-date.
-                 */
                err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
                if (err)
                        return err;
@@ -944,9 +947,8 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                return err;
        count = ocount;
+        sb_start_write(inode->i_sb);
        mutex_lock(&inode->i_mutex);
-        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
@@ -1004,6 +1006,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 out:
        current->backing_dev_info = NULL;
        mutex_unlock(&inode->i_mutex);
+        sb_end_write(inode->i_sb);
        return written ? written : err;
 }
@@ -1700,7 +1703,7 @@ static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
        size_t n;
        u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
-        for (n = 0; n < count; n++) {
+        for (n = 0; n < count; n++, iov++) {
                if (iov->iov_len > (size_t) max)
                        return -ENOMEM;
                max -= iov->iov_len;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 771fb6322c07..e24dd74e3068 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -484,6 +484,9 @@ struct fuse_conn {
        /** Is fallocate not implemented by fs? */
        unsigned no_fallocate:1;
+        /** Use enhanced/automatic page cache invalidation. */
+        unsigned auto_inval_data:1;
        /** The number of requests waiting for completion */
        atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 1cd61652018c..fca222dabe3c 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -197,6 +197,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_inode *fi = get_fuse_inode(inode);
        loff_t oldsize;
+        struct timespec old_mtime;
        spin_lock(&fc->lock);
        if (attr_version != 0 && fi->attr_version > attr_version) {
@@ -204,15 +205,35 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
                return;
        }
+        old_mtime = inode->i_mtime;
        fuse_change_attributes_common(inode, attr, attr_valid);
        oldsize = inode->i_size;
        i_size_write(inode, attr->size);
        spin_unlock(&fc->lock);
-        if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
+        if (S_ISREG(inode->i_mode)) {
-                truncate_pagecache(inode, oldsize, attr->size);
+                bool inval = false;
-                invalidate_inode_pages2(inode->i_mapping);
+                if (oldsize != attr->size) {
+                        truncate_pagecache(inode, oldsize, attr->size);
+                        inval = true;
+                } else if (fc->auto_inval_data) {
+                        struct timespec new_mtime = {
+                                .tv_sec = attr->mtime,
+                                .tv_nsec = attr->mtimensec,
+                        };
+                        /*
+                         * Auto inval mode also checks and invalidates if mtime
+                         * has changed.
+                         */
+                        if (!timespec_equal(&old_mtime, &new_mtime))
+                                inval = true;
+                }
+                if (inval)
+                        invalidate_inode_pages2(inode->i_mapping);
        }
 }
@@ -346,11 +367,6 @@ void fuse_conn_kill(struct fuse_conn *fc)
        wake_up_all(&fc->waitq);
        wake_up_all(&fc->blocked_waitq);
        wake_up_all(&fc->reserved_req_waitq);
-        mutex_lock(&fuse_mutex);
-        list_del(&fc->entry);
-        fuse_ctl_remove_conn(fc);
-        mutex_unlock(&fuse_mutex);
-        fuse_bdi_destroy(fc);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_kill);
@@ -359,7 +375,14 @@ static void fuse_put_super(struct super_block *sb)
        struct fuse_conn *fc = get_fuse_conn_super(sb);
        fuse_send_destroy(fc);
        fuse_conn_kill(fc);
+        mutex_lock(&fuse_mutex);
+        list_del(&fc->entry);
+        fuse_ctl_remove_conn(fc);
+        mutex_unlock(&fuse_mutex);
+        fuse_bdi_destroy(fc);
        fuse_conn_put(fc);
 }
@@ -834,6 +857,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
                                fc->big_writes = 1;
                        if (arg->flags & FUSE_DONT_MASK)
                                fc->dont_mask = 1;
+                        if (arg->flags & FUSE_AUTO_INVAL_DATA)
+                                fc->auto_inval_data = 1;
                } else {
                        ra_pages = fc->max_read / PAGE_CACHE_SIZE;
                        fc->no_lock = 1;
@@ -859,7 +884,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
        arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
        arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
                FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
-                FUSE_FLOCK_LOCKS;
+                FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
+                FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA;
        req->in.h.opcode = FUSE_INIT;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(*arg);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 9aa6af13823c..d1d791ef38de 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -373,11 +373,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        loff_t size;
        int ret;
-        /* Wait if fs is frozen. This is racy so we check again later on
+        sb_start_pagefault(inode->i_sb);
-         * and retry if the fs has been frozen after the page lock has
-         * been acquired
+        /* Update file times before taking page lock */
-         */
+        file_update_time(vma->vm_file);
-        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
        ret = gfs2_rs_alloc(ip);
        if (ret)
@@ -462,14 +461,9 @@ out:
        gfs2_holder_uninit(&gh);
        if (ret == 0) {
                set_page_dirty(page);
-                /* This check must be post dropping of transaction lock */
+                wait_on_page_writeback(page);
-                if (inode->i_sb->s_frozen == SB_UNFROZEN) {
-                        wait_on_page_writeback(page);
-                } else {
-                        ret = -EAGAIN;
-                        unlock_page(page);
-                }
        }
+        sb_end_pagefault(inode->i_sb);
        return block_page_mkwrite_return(ret);
 }
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 3a56c8d94de0..22255d96b27e 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -52,7 +52,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the page.  Note that this can
-                 * potentially cause a busy-wait loop from pdflush and kswapd
+                 * potentially cause a busy-wait loop from flusher thread and kswapd
                 * activity, but those code paths have their own higher-level
                 * throttling.
                 */
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index ad3e2fb763d7..adbd27875ef9 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -50,6 +50,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
        if (revokes)
                tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
                                                   sizeof(u64));
+        sb_start_intwrite(sdp->sd_vfs);
        gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
        error = gfs2_glock_nq(&tr->tr_t_gh);
@@ -68,6 +69,7 @@ fail_gunlock:
        gfs2_glock_dq(&tr->tr_t_gh);
 fail_holder_uninit:
+        sb_end_intwrite(sdp->sd_vfs);
        gfs2_holder_uninit(&tr->tr_t_gh);
        kfree(tr);
@@ -116,6 +118,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
                        gfs2_holder_uninit(&tr->tr_t_gh);
                        kfree(tr);
                }
+                sb_end_intwrite(sdp->sd_vfs);
                return;
        }
@@ -136,6 +139,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
        if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
                gfs2_log_flush(sdp, NULL);
+        sb_end_intwrite(sdp->sd_vfs);
 }
 /**
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 5fd51a5833ff..b7ec224910c5 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -236,10 +236,10 @@ out:
 * hfs_mdb_commit()
 *
 * Description:
- *   This updates the MDB on disk (look also at hfs_write_super()).
+ *   This updates the MDB on disk.
 *   It does not check, if the superblock has been modified, or
 *   if the filesystem has been mounted read-only. It is mainly
- *   called by hfs_write_super() and hfs_btree_extend().
+ *   called by hfs_sync_fs() and flush_mdb().
 * Input Variable(s):
 *   struct hfs_mdb *mdb: Pointer to the hfs MDB
 *   int backup;
diff --git a/fs/inode.c b/fs/inode.c
index 3cc504320467..ac8d904b3f16 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1542,9 +1542,11 @@ void touch_atime(struct path *path)
        if (timespec_equal(&inode->i_atime, &now))
                return;
-        if (mnt_want_write(mnt))
+        if (!sb_start_write_trylock(inode->i_sb))
                return;
+        if (__mnt_want_write(mnt))
+                goto skip_update;
        /*
         * File systems can error out when updating inodes if they need to
         * allocate new space to modify an inode (such is the case for
@@ -1555,7 +1557,9 @@ void touch_atime(struct path *path)
         * of the fs read only, e.g. subvolumes in Btrfs.
         */
        update_time(inode, &now, S_ATIME);
-        mnt_drop_write(mnt);
+        __mnt_drop_write(mnt);
+skip_update:
+        sb_end_write(inode->i_sb);
 }
 EXPORT_SYMBOL(touch_atime);
@@ -1662,11 +1666,11 @@ int file_update_time(struct file *file)
                return 0;
        /* Finally allowed to write? Takes lock. */
-        if (mnt_want_write_file(file))
+        if (__mnt_want_write_file(file))
                return 0;
        ret = update_time(inode, &now, sync_it);
-        mnt_drop_write_file(file);
+        __mnt_drop_write_file(file);
        return ret;
 }
diff --git a/fs/internal.h b/fs/internal.h
index a6fd56c68b11..371bcc4b1697 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -61,6 +61,10 @@ extern void __init mnt_init(void);
 extern struct lglock vfsmount_lock;
+extern int __mnt_want_write(struct vfsmount *);
+extern int __mnt_want_write_file(struct file *);
+extern void __mnt_drop_write(struct vfsmount *);
+extern void __mnt_drop_write_file(struct file *);
 /*
 * fs_struct.c
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 425c2f2cf170..a2862339323b 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -534,8 +534,8 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
                ret = 1;
        } else if (journal->j_committing_transaction) {
                /*
-                 * If ext3_write_super() recently started a commit, then we
+                 * If a commit has been started, then we have to wait for
-                 * have to wait for completion of that transaction
+                 * completion of that transaction.
                 */
                if (ptid)
                        *ptid = journal->j_committing_transaction->t_tid;
@@ -1113,6 +1113,11 @@ static void mark_journal_empty(journal_t *journal)
        BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
        spin_lock(&journal->j_state_lock);
+        /* Is it already empty? */
+        if (sb->s_start == 0) {
+                spin_unlock(&journal->j_state_lock);
+                return;
+        }
        jbd_debug(1, "JBD: Marking journal as empty (seq %d)\n",
                  journal->j_tail_sequence);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e9a3c4c85594..e149b99a7ffb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -612,8 +612,8 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
                ret = 1;
        } else if (journal->j_committing_transaction) {
                /*
-                 * If ext3_write_super() recently started a commit, then we
+                 * If a commit has been started, then we have to wait for
-                 * have to wait for completion of that transaction
+                 * completion of that transaction.
                 */
                if (ptid)
                        *ptid = journal->j_committing_transaction->t_tid;
@@ -1377,7 +1377,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
 * Update a journal's errno.  Write updated superblock to disk waiting for IO
 * to complete.
 */
-static void jbd2_journal_update_sb_errno(journal_t *journal)
+void jbd2_journal_update_sb_errno(journal_t *journal)
 {
        journal_superblock_t *sb = journal->j_superblock;
@@ -1390,6 +1390,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
        jbd2_write_superblock(journal, WRITE_SYNC);
 }
+EXPORT_SYMBOL(jbd2_journal_update_sb_errno);
 /*
 * Read the superblock for a given journal, performing initial
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 8392cb85bd54..05d29124c6ab 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -156,12 +156,16 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl)
        struct nlm_rqst         *call;
        int                     status;
-        nlm_get_host(host);
        call = nlm_alloc_call(host);
        if (call == NULL)
                return -ENOMEM;
        nlmclnt_locks_init_private(fl, host);
+        if (!fl->fl_u.nfs_fl.owner) {
+                /* lockowner allocation has failed */
+                nlmclnt_release_call(call);
+                return -ENOMEM;
+        }
        /* Set up the argument struct */
        nlmclnt_setlockargs(call, fl);
@@ -185,9 +189,6 @@ EXPORT_SYMBOL_GPL(nlmclnt_proc);
 /*
 * Allocate an NLM RPC call struct
- *
- * Note: the caller must hold a reference to host. In case of failure,
- * this reference will be released.
 */
 struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
 {
@@ -199,7 +200,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
                        atomic_set(&call->a_count, 1);
                        locks_init_lock(&call->a_args.lock.fl);
                        locks_init_lock(&call->a_res.lock.fl);
-                        call->a_host = host;
+                        call->a_host = nlm_get_host(host);
                        return call;
                }
                if (signalled())
@@ -207,7 +208,6 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host)
                printk("nlm_alloc_call: failed, waiting for memory\n");
                schedule_timeout_interruptible(5*HZ);
        }
-        nlmclnt_release_host(host);
        return NULL;
 }
@@ -750,7 +750,7 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
        dprintk("lockd: blocking lock attempt was interrupted by a signal.\n"
                "       Attempting to cancel lock.\n");
-        req = nlm_alloc_call(nlm_get_host(host));
+        req = nlm_alloc_call(host);
        if (!req)
                return -ENOMEM;
        req->a_flags = RPC_TASK_ASYNC;
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 4a43d253c045..b147d1ae71fd 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -257,6 +257,7 @@ static __be32 nlm4svc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
                return rpc_system_err;
        call = nlm_alloc_call(host);
+        nlmsvc_release_host(host);
        if (call == NULL)
                return rpc_system_err;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index afe4488c33d8..fb1a2bedbe97 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -219,7 +219,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
        struct nlm_block        *block;
        struct nlm_rqst         *call = NULL;
-        nlm_get_host(host);
        call = nlm_alloc_call(host);
        if (call == NULL)
                return NULL;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index de8f2caa2235..3009a365e082 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -297,6 +297,7 @@ static __be32 nlmsvc_callback(struct svc_rqst *rqstp, u32 proc, struct nlm_args
                return rpc_system_err;
        call = nlm_alloc_call(host);
+        nlmsvc_release_host(host);
        if (call == NULL)
                return rpc_system_err;
diff --git a/fs/locks.c b/fs/locks.c
index cdcf219a7391..7e81bfc75164 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -200,11 +200,7 @@ void locks_release_private(struct file_lock *fl)
                        fl->fl_ops->fl_release_private(fl);
                fl->fl_ops = NULL;
        }
-        if (fl->fl_lmops) {
+        fl->fl_lmops = NULL;
-                if (fl->fl_lmops->lm_release_private)
-                        fl->fl_lmops->lm_release_private(fl);
-                fl->fl_lmops = NULL;
-        }
 }
 EXPORT_SYMBOL_GPL(locks_release_private);
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index df0de27c2733..e784a217b500 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -26,6 +26,7 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
        struct completion complete;
        bio_init(&bio);
+        bio.bi_max_vecs = 1;
        bio.bi_io_vec = &bio_vec;
        bio_vec.bv_page = page;
        bio_vec.bv_len = PAGE_SIZE;
@@ -95,12 +96,11 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
        struct address_space *mapping = super->s_mapping_inode->i_mapping;
        struct bio *bio;
        struct page *page;
-        struct request_queue *q = bdev_get_queue(sb->s_bdev);
+        unsigned int max_pages;
-        unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
        int i;
-        if (max_pages > BIO_MAX_PAGES)
+        max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
-                max_pages = BIO_MAX_PAGES;
        bio = bio_alloc(GFP_NOFS, max_pages);
        BUG_ON(!bio);
@@ -190,12 +190,11 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
 {
        struct logfs_super *super = logfs_super(sb);
        struct bio *bio;
-        struct request_queue *q = bdev_get_queue(sb->s_bdev);
+        unsigned int max_pages;
-        unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
        int i;
-        if (max_pages > BIO_MAX_PAGES)
+        max_pages = min(nr_pages, (size_t) bio_get_nr_vecs(super->s_bdev));
-                max_pages = BIO_MAX_PAGES;
        bio = bio_alloc(GFP_NOFS, max_pages);
        BUG_ON(!bio);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index a422f42238b2..6984562738d3 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -156,10 +156,26 @@ static void __logfs_destroy_inode(struct inode *inode)
        call_rcu(&inode->i_rcu, logfs_i_callback);
 }
+static void __logfs_destroy_meta_inode(struct inode *inode)
+{
+        struct logfs_inode *li = logfs_inode(inode);
+        BUG_ON(li->li_block);
+        call_rcu(&inode->i_rcu, logfs_i_callback);
+}
 static void logfs_destroy_inode(struct inode *inode)
 {
        struct logfs_inode *li = logfs_inode(inode);
+        if (inode->i_ino < LOGFS_RESERVED_INOS) {
+                /*
+                 * The reserved inodes are never destroyed unless we are in
+                 * the unmount path.
+                 */
+                __logfs_destroy_meta_inode(inode);
+                return;
+        }
        BUG_ON(list_empty(&li->li_freeing_list));
        spin_lock(&logfs_inode_lock);
        li->li_refcount--;
@@ -373,8 +389,8 @@ static void logfs_put_super(struct super_block *sb)
 {
        struct logfs_super *super = logfs_super(sb);
        /* kill the meta-inodes */
-        iput(super->s_master_inode);
        iput(super->s_segfile_inode);
+        iput(super->s_master_inode);
        iput(super->s_mapping_inode);
 }
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 1e1c369df22b..2a09b8d73989 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -565,7 +565,7 @@ static void write_wbuf(struct super_block *sb, struct logfs_area *area,
        index = ofs >> PAGE_SHIFT;
        page_ofs = ofs & (PAGE_SIZE - 1);
-        page = find_lock_page(mapping, index);
+        page = find_or_create_page(mapping, index, GFP_NOFS);
        BUG_ON(!page);
        memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
        unlock_page(page);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index f1cb512c5019..5be0abef603d 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2189,7 +2189,6 @@ void logfs_evict_inode(struct inode *inode)
                return;
        }
-        BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
        page = inode_to_page(inode);
        BUG_ON(!page); /* FIXME: Use emergency page */
        logfs_put_write_page(page);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index e28d090c98d6..038da0991794 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -886,7 +886,7 @@ static struct logfs_area *alloc_area(struct super_block *sb)
 static void map_invalidatepage(struct page *page, unsigned long l)
 {
-        BUG();
+        return;
 }
 static int map_releasepage(struct page *page, gfp_t g)
diff --git a/fs/namei.c b/fs/namei.c
index 2ccc35c4dc24..dd1ed1b8e98e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -352,6 +352,7 @@ int __inode_permission(struct inode *inode, int mask)
 /**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
+ * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
@@ -650,6 +651,122 @@ static inline void put_link(struct nameidata *nd, struct path *link, void *cooki
        path_put(link);
 }
+int sysctl_protected_symlinks __read_mostly = 1;
+int sysctl_protected_hardlinks __read_mostly = 1;
+/**
+ * may_follow_link - Check symlink following for unsafe situations
+ * @link: The path of the symlink
+ * @nd: nameidata pathwalk data
+ *
+ * In the case of the sysctl_protected_symlinks sysctl being enabled,
+ * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
+ * in a sticky world-writable directory. This is to protect privileged
+ * processes from failing races against path names that may change out
+ * from under them by way of other users creating malicious symlinks.
+ * It will permit symlinks to be followed only when outside a sticky
+ * world-writable directory, or when the uid of the symlink and follower
+ * match, or when the directory owner matches the symlink's owner.
+ *
+ * Returns 0 if following the symlink is allowed, -ve on error.
+ */
+static inline int may_follow_link(struct path *link, struct nameidata *nd)
+{
+        const struct inode *inode;
+        const struct inode *parent;
+        if (!sysctl_protected_symlinks)
+                return 0;
+        /* Allowed if owner and follower match. */
+        inode = link->dentry->d_inode;
+        if (current_cred()->fsuid == inode->i_uid)
+                return 0;
+        /* Allowed if parent directory not sticky and world-writable. */
+        parent = nd->path.dentry->d_inode;
+        if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
+                return 0;
+        /* Allowed if parent directory and link owner match. */
+        if (parent->i_uid == inode->i_uid)
+                return 0;
+        path_put_conditional(link, nd);
+        path_put(&nd->path);
+        audit_log_link_denied("follow_link", link);
+        return -EACCES;
+}
+/**
+ * safe_hardlink_source - Check for safe hardlink conditions
+ * @inode: the source inode to hardlink from
+ *
+ * Returns false if at least one of the following conditions holds:
+ *    - inode is not a regular file
+ *    - inode is setuid
+ *    - inode is setgid and group-exec
+ *    - access failure for read and write
+ *
+ * Otherwise returns true.
+ */
+static bool safe_hardlink_source(struct inode *inode)
+{
+        umode_t mode = inode->i_mode;
+        /* Special files should not get pinned to the filesystem. */
+        if (!S_ISREG(mode))
+                return false;
+        /* Setuid files should not get pinned to the filesystem. */
+        if (mode & S_ISUID)
+                return false;
+        /* Executable setgid files should not get pinned to the filesystem. */
+        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
+                return false;
+        /* Hardlinking to unreadable or unwritable sources is dangerous. */
+        if (inode_permission(inode, MAY_READ | MAY_WRITE))
+                return false;
+        return true;
+}
+/**
+ * may_linkat - Check permissions for creating a hardlink
+ * @link: the source to hardlink from
+ *
+ * Block hardlink when all of:
+ *  - sysctl_protected_hardlinks enabled
+ *  - fsuid does not match inode
+ *  - hardlink source is unsafe (see safe_hardlink_source() above)
+ *  - not CAP_FOWNER
+ *
+ * Returns 0 if successful, -ve on error.
+ */
+static int may_linkat(struct path *link)
+{
+        const struct cred *cred;
+        struct inode *inode;
+        if (!sysctl_protected_hardlinks)
+                return 0;
+        cred = current_cred();
+        inode = link->dentry->d_inode;
+        /* Source inode owner (or CAP_FOWNER) can hardlink all they like,
+         * otherwise, it must be a safe source.
+         */
+        if (cred->fsuid == inode->i_uid || safe_hardlink_source(inode) ||
+            capable(CAP_FOWNER))
+                return 0;
+        audit_log_link_denied("linkat", link);
+        return -EPERM;
+}
 static __always_inline int
 follow_link(struct path *link, struct nameidata *nd, void **p)
 {
@@ -1818,6 +1935,9 @@ static int path_lookupat(int dfd, const char *name,
                while (err > 0) {
                        void *cookie;
                        struct path link = path;
+                        err = may_follow_link(&link, nd);
+                        if (unlikely(err))
+                                break;
                        nd->flags |= LOOKUP_PARENT;
                        err = follow_link(&link, nd, &cookie);
                        if (err)
@@ -2277,7 +2397,7 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
 static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                        struct path *path, struct file *file,
                        const struct open_flags *op,
-                        bool *want_write, bool need_lookup,
+                        bool got_write, bool need_lookup,
                        int *opened)
 {
        struct inode *dir =  nd->path.dentry->d_inode;
@@ -2296,11 +2416,11 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                goto out;
        }
-        mode = op->mode & S_IALLUGO;
+        mode = op->mode;
        if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
                mode &= ~current_umask();
-        if (open_flag & O_EXCL) {
+        if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
                open_flag &= ~O_TRUNC;
                *opened |= FILE_CREATED;
        }
@@ -2314,12 +2434,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
         * Another problem is returing the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
-        if ((open_flag & (O_CREAT | O_TRUNC)) ||
+        if (((open_flag & (O_CREAT | O_TRUNC)) ||
-            (open_flag & O_ACCMODE) != O_RDONLY) {
+            (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
-                error = mnt_want_write(nd->path.mnt);
+                if (!(open_flag & O_CREAT)) {
-                if (!error) {
-                        *want_write = true;
-                } else if (!(open_flag & O_CREAT)) {
                        /*
                         * No O_CREATE -> atomicity not a requirement -> fall
                         * back to lookup + open
@@ -2327,17 +2444,17 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                        goto no_open;
                } else if (open_flag & (O_EXCL | O_TRUNC)) {
                        /* Fall back and fail with the right error */
-                        create_error = error;
+                        create_error = -EROFS;
                        goto no_open;
                } else {
                        /* No side effects, safe to clear O_CREAT */
-                        create_error = error;
+                        create_error = -EROFS;
                        open_flag &= ~O_CREAT;
                }
        }
        if (open_flag & O_CREAT) {
-                error = may_o_create(&nd->path, dentry, op->mode);
+                error = may_o_create(&nd->path, dentry, mode);
                if (error) {
                        create_error = error;
                        if (open_flag & O_EXCL)
@@ -2374,6 +2491,10 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
                        dput(dentry);
                        dentry = file->f_path.dentry;
                }
+                if (create_error && dentry->d_inode == NULL) {
+                        error = create_error;
+                        goto out;
+                }
                goto looked_up;
        }
@@ -2438,7 +2559,7 @@ looked_up:
 static int lookup_open(struct nameidata *nd, struct path *path,
                        struct file *file,
                        const struct open_flags *op,
-                        bool *want_write, int *opened)
+                        bool got_write, int *opened)
 {
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
@@ -2456,7 +2577,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
                goto out_no_open;
        if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
-                return atomic_open(nd, dentry, path, file, op, want_write,
+                return atomic_open(nd, dentry, path, file, op, got_write,
                                   need_lookup, opened);
        }
@@ -2480,10 +2601,10 @@ static int lookup_open(struct nameidata *nd, struct path *path,
                 * a permanent write count is taken through
                 * the 'struct file' in finish_open().
                 */
-                error = mnt_want_write(nd->path.mnt);
+                if (!got_write) {
-                if (error)
+                        error = -EROFS;
                        goto out_dput;
-                *want_write = true;
+                }
                *opened |= FILE_CREATED;
                error = security_path_mknod(&nd->path, dentry, mode, 0);
                if (error)
@@ -2513,7 +2634,7 @@ static int do_last(struct nameidata *nd, struct path *path,
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool will_truncate = (open_flag & O_TRUNC) != 0;
-        bool want_write = false;
+        bool got_write = false;
        int acc_mode = op->acc_mode;
        struct inode *inode;
        bool symlink_ok = false;
@@ -2582,8 +2703,18 @@ static int do_last(struct nameidata *nd, struct path *path,
        }
 retry_lookup:
+        if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+                error = mnt_want_write(nd->path.mnt);
+                if (!error)
+                        got_write = true;
+                /*
+                 * do _not_ fail yet - we might not need that or fail with
+                 * a different error; let lookup_open() decide; we'll be
+                 * dropping this one anyway.
+                 */
+        }
        mutex_lock(&dir->d_inode->i_mutex);
-        error = lookup_open(nd, path, file, op, &want_write, opened);
+        error = lookup_open(nd, path, file, op, got_write, opened);
        mutex_unlock(&dir->d_inode->i_mutex);
        if (error <= 0) {
@@ -2608,22 +2739,23 @@ retry_lookup:
        }
        /*
-         * It already exists.
+         * create/update audit record if it already exists.
         */
-        audit_inode(pathname, path->dentry);
+        if (path->dentry->d_inode)
+                audit_inode(pathname, path->dentry);
        /*
         * If atomic_open() acquired write access it is dropped now due to
         * possible mount and symlink following (this might be optimized away if
         * necessary...)
         */
-        if (want_write) {
+        if (got_write) {
                mnt_drop_write(nd->path.mnt);
-                want_write = false;
+                got_write = false;
        }
        error = -EEXIST;
-        if (open_flag & O_EXCL)
+        if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))
                goto exit_dput;
        error = follow_managed(path, nd->flags);
@@ -2684,7 +2816,7 @@ finish_open:
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        goto out;
-                want_write = true;
+                got_write = true;
        }
 finish_open_created:
        error = may_open(&nd->path, acc_mode, open_flag);
@@ -2711,7 +2843,7 @@ opened:
                        goto exit_fput;
        }
 out:
-        if (want_write)
+        if (got_write)
                mnt_drop_write(nd->path.mnt);
        path_put(&save_parent);
        terminate_walk(nd);
@@ -2735,9 +2867,9 @@ stale_open:
        nd->inode = dir->d_inode;
        save_parent.mnt = NULL;
        save_parent.dentry = NULL;
-        if (want_write) {
+        if (got_write) {
                mnt_drop_write(nd->path.mnt);
-                want_write = false;
+                got_write = false;
        }
        retried = true;
        goto retry_lookup;
@@ -2777,6 +2909,9 @@ static struct file *path_openat(int dfd, const char *pathname,
                        error = -ELOOP;
                        break;
                }
+                error = may_follow_link(&link, nd);
+                if (unlikely(error))
+                        break;
                nd->flags |= LOOKUP_PARENT;
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
                error = follow_link(&link, nd, &cookie);
@@ -2846,6 +2981,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
 {
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct nameidata nd;
+        int err2;
        int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
        if (error)
                return ERR_PTR(error);
@@ -2859,16 +2995,19 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
        nd.flags &= ~LOOKUP_PARENT;
        nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL;
+        /* don't fail immediately if it's r/o, at least try to report other errors */
+        err2 = mnt_want_write(nd.path.mnt);
        /*
         * Do the final lookup.
         */
        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
        dentry = lookup_hash(&nd);
        if (IS_ERR(dentry))
-                goto fail;
+                goto unlock;
+        error = -EEXIST;
        if (dentry->d_inode)
-                goto eexist;
+                goto fail;
        /*
         * Special case - lookup gave negative, but... we had foo/bar/
         * From the vfs_mknod() POV we just have a negative dentry -
@@ -2876,23 +3015,37 @@ struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path
         * been asking for (non-existent) directory. -ENOENT for you.
         */
        if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
-                dput(dentry);
+                error = -ENOENT;
-                dentry = ERR_PTR(-ENOENT);
+                goto fail;
+        }
+        if (unlikely(err2)) {
+                error = err2;
                goto fail;
        }
        *path = nd.path;
        return dentry;
-eexist:
-        dput(dentry);
-        dentry = ERR_PTR(-EEXIST);
 fail:
+        dput(dentry);
+        dentry = ERR_PTR(error);
+unlock:
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+        if (!err2)
+                mnt_drop_write(nd.path.mnt);
 out:
        path_put(&nd.path);
        return dentry;
 }
 EXPORT_SYMBOL(kern_path_create);
+void done_path_create(struct path *path, struct dentry *dentry)
+{
+        dput(dentry);
+        mutex_unlock(&path->dentry->d_inode->i_mutex);
+        mnt_drop_write(path->mnt);
+        path_put(path);
+}
+EXPORT_SYMBOL(done_path_create);
 struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir)
 {
        char *tmp = getname(pathname);
@@ -2956,8 +3109,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
        struct path path;
        int error;
-        if (S_ISDIR(mode))
+        error = may_mknod(mode);
-                return -EPERM;
+        if (error)
+                return error;
        dentry = user_path_create(dfd, filename, &path, 0);
        if (IS_ERR(dentry))
@@ -2965,15 +3119,9 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
        if (!IS_POSIXACL(path.dentry->d_inode))
                mode &= ~current_umask();
-        error = may_mknod(mode);
-        if (error)
-                goto out_dput;
-        error = mnt_want_write(path.mnt);
-        if (error)
-                goto out_dput;
        error = security_path_mknod(&path, dentry, mode, dev);
        if (error)
-                goto out_drop_write;
+                goto out;
        switch (mode & S_IFMT) {
                case 0: case S_IFREG:
                        error = vfs_create(path.dentry->d_inode,dentry,mode,true);
@@ -2986,13 +3134,8 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                        error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
                        break;
        }
-out_drop_write:
+out:
-        mnt_drop_write(path.mnt);
+        done_path_create(&path, dentry);
-out_dput:
-        dput(dentry);
-        mutex_unlock(&path.dentry->d_inode->i_mutex);
-        path_put(&path);
        return error;
 }
@@ -3038,19 +3181,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
        if (!IS_POSIXACL(path.dentry->d_inode))
                mode &= ~current_umask();
-        error = mnt_want_write(path.mnt);
-        if (error)
-                goto out_dput;
        error = security_path_mkdir(&path, dentry, mode);
-        if (error)
+        if (!error)
-                goto out_drop_write;
+                error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
-        error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+        done_path_create(&path, dentry);
-out_drop_write:
-        mnt_drop_write(path.mnt);
-out_dput:
-        dput(dentry);
-        mutex_unlock(&path.dentry->d_inode->i_mutex);
-        path_put(&path);
        return error;
 }
@@ -3144,6 +3278,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
        }
        nd.flags &= ~LOOKUP_PARENT;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto exit1;
        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
        dentry = lookup_hash(&nd);
@@ -3154,19 +3291,15 @@ static long do_rmdir(int dfd, const char __user *pathname)
                error = -ENOENT;
                goto exit3;
        }
-        error = mnt_want_write(nd.path.mnt);
-        if (error)
-                goto exit3;
        error = security_path_rmdir(&nd.path, dentry);
        if (error)
-                goto exit4;
+                goto exit3;
        error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
-exit4:
-        mnt_drop_write(nd.path.mnt);
 exit3:
        dput(dentry);
 exit2:
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+        mnt_drop_write(nd.path.mnt);
 exit1:
        path_put(&nd.path);
        putname(name);
@@ -3233,6 +3366,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                goto exit1;
        nd.flags &= ~LOOKUP_PARENT;
+        error = mnt_want_write(nd.path.mnt);
+        if (error)
+                goto exit1;
        mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
        dentry = lookup_hash(&nd);
@@ -3245,21 +3381,17 @@ static long do_unlinkat(int dfd, const char __user *pathname)
                if (!inode)
                        goto slashes;
                ihold(inode);
-                error = mnt_want_write(nd.path.mnt);
-                if (error)
-                        goto exit2;
                error = security_path_unlink(&nd.path, dentry);
                if (error)
-                        goto exit3;
+                        goto exit2;
                error = vfs_unlink(nd.path.dentry->d_inode, dentry);
-exit3:
+exit2:
-                mnt_drop_write(nd.path.mnt);
-        exit2:
                dput(dentry);
        }
        mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
        if (inode)
                iput(inode);    /* truncate the inode here */
+        mnt_drop_write(nd.path.mnt);
 exit1:
        path_put(&nd.path);
        putname(name);
@@ -3324,19 +3456,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
        if (IS_ERR(dentry))
                goto out_putname;
-        error = mnt_want_write(path.mnt);
-        if (error)
-                goto out_dput;
        error = security_path_symlink(&path, dentry, from);
-        if (error)
+        if (!error)
-                goto out_drop_write;
+                error = vfs_symlink(path.dentry->d_inode, dentry, from);
-        error = vfs_symlink(path.dentry->d_inode, dentry, from);
+        done_path_create(&path, dentry);
-out_drop_write:
-        mnt_drop_write(path.mnt);
-out_dput:
-        dput(dentry);
-        mutex_unlock(&path.dentry->d_inode->i_mutex);
-        path_put(&path);
 out_putname:
        putname(from);
        return error;
@@ -3436,19 +3559,15 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto out_dput;
-        error = mnt_want_write(new_path.mnt);
+        error = may_linkat(&old_path);
-        if (error)
+        if (unlikely(error))
                goto out_dput;
        error = security_path_link(old_path.dentry, &new_path, new_dentry);
        if (error)
-                goto out_drop_write;
+                goto out_dput;
        error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
-out_drop_write:
-        mnt_drop_write(new_path.mnt);
 out_dput:
-        dput(new_dentry);
+        done_path_create(&new_path, new_dentry);
-        mutex_unlock(&new_path.dentry->d_inode->i_mutex);
-        path_put(&new_path);
 out:
        path_put(&old_path);
@@ -3644,6 +3763,10 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
        if (newnd.last_type != LAST_NORM)
                goto exit2;
+        error = mnt_want_write(oldnd.path.mnt);
+        if (error)
+                goto exit2;
        oldnd.flags &= ~LOOKUP_PARENT;
        newnd.flags &= ~LOOKUP_PARENT;
        newnd.flags |= LOOKUP_RENAME_TARGET;
@@ -3679,23 +3802,19 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
        if (new_dentry == trap)
                goto exit5;
-        error = mnt_want_write(oldnd.path.mnt);
-        if (error)
-                goto exit5;
        error = security_path_rename(&oldnd.path, old_dentry,
                                     &newnd.path, new_dentry);
        if (error)
-                goto exit6;
+                goto exit5;
        error = vfs_rename(old_dir->d_inode, old_dentry,
                                   new_dir->d_inode, new_dentry);
-exit6:
-        mnt_drop_write(oldnd.path.mnt);
 exit5:
        dput(new_dentry);
 exit4:
        dput(old_dentry);
 exit3:
        unlock_rename(new_dir, old_dir);
+        mnt_drop_write(oldnd.path.mnt);
 exit2:
        path_put(&newnd.path);
        putname(to);
diff --git a/fs/namespace.c b/fs/namespace.c
index c53d3381b0d0..4d31f73e2561 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -283,24 +283,22 @@ static int mnt_is_readonly(struct vfsmount *mnt)
 }
 /*
- * Most r/o checks on a fs are for operations that take
+ * Most r/o & frozen checks on a fs are for operations that take discrete
- * discrete amounts of time, like a write() or unlink().
+ * amounts of time, like a write() or unlink().  We must keep track of when
- * We must keep track of when those operations start
+ * those operations start (for permission checks) and when they end, so that we
- * (for permission checks) and when they end, so that
+ * can determine when writes are able to occur to a filesystem.
- * we can determine when writes are able to occur to
- * a filesystem.
 */
 /**
- * mnt_want_write - get write access to a mount
+ * __mnt_want_write - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
- * This tells the low-level filesystem that a write is
+ * This tells the low-level filesystem that a write is about to be performed to
- * about to be performed to it, and makes sure that
+ * it, and makes sure that writes are allowed (mount is read-write) before
- * writes are allowed before returning success.  When
+ * returning success. This operation does not protect against filesystem being
- * the write operation is finished, mnt_drop_write()
+ * frozen. When the write operation is finished, __mnt_drop_write() must be
- * must be called.  This is effectively a refcount.
+ * called. This is effectively a refcount.
 */
-int mnt_want_write(struct vfsmount *m)
+int __mnt_want_write(struct vfsmount *m)
 {
        struct mount *mnt = real_mount(m);
        int ret = 0;
@@ -326,6 +324,27 @@ int mnt_want_write(struct vfsmount *m)
                ret = -EROFS;
        }
        preempt_enable();
+        return ret;
+}
+/**
+ * mnt_want_write - get write access to a mount
+ * @m: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that writes are allowed (mount is read-write, filesystem
+ * is not frozen) before returning success.  When the write operation is
+ * finished, mnt_drop_write() must be called.  This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *m)
+{
+        int ret;
+        sb_start_write(m->mnt_sb);
+        ret = __mnt_want_write(m);
+        if (ret)
+                sb_end_write(m->mnt_sb);
        return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -355,38 +374,76 @@ int mnt_clone_write(struct vfsmount *mnt)
 EXPORT_SYMBOL_GPL(mnt_clone_write);
 /**
- * mnt_want_write_file - get write access to a file's mount
+ * __mnt_want_write_file - get write access to a file's mount
 * @file: the file who's mount on which to take a write
 *
- * This is like mnt_want_write, but it takes a file and can
+ * This is like __mnt_want_write, but it takes a file and can
 * do some optimisations if the file is open for write already
 */
-int mnt_want_write_file(struct file *file)
+int __mnt_want_write_file(struct file *file)
 {
        struct inode *inode = file->f_dentry->d_inode;
        if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
-                return mnt_want_write(file->f_path.mnt);
+                return __mnt_want_write(file->f_path.mnt);
        else
                return mnt_clone_write(file->f_path.mnt);
 }
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file on whose mount to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct file *file)
+{
+        int ret;
+        sb_start_write(file->f_path.mnt->mnt_sb);
+        ret = __mnt_want_write_file(file);
+        if (ret)
+                sb_end_write(file->f_path.mnt->mnt_sb);
+        return ret;
+}
 EXPORT_SYMBOL_GPL(mnt_want_write_file);
 /**
- * mnt_drop_write - give up write access to a mount
+ * __mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
- * mnt_want_write() call above.
+ * __mnt_want_write() call above.
 */
-void mnt_drop_write(struct vfsmount *mnt)
+void __mnt_drop_write(struct vfsmount *mnt)
 {
        preempt_disable();
        mnt_dec_writers(real_mount(mnt));
        preempt_enable();
 }
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done performing writes to it and
+ * also allows filesystem to be frozen again.  Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+        __mnt_drop_write(mnt);
+        sb_end_write(mnt->mnt_sb);
+}
 EXPORT_SYMBOL_GPL(mnt_drop_write);
+void __mnt_drop_write_file(struct file *file)
+{
+        __mnt_drop_write(file->f_path.mnt);
+}
 void mnt_drop_write_file(struct file *file)
 {
        mnt_drop_write(file->f_path.mnt);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cbaf4f8bb7b7..4c7bd35b1876 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -651,12 +651,12 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
        if (clp->cl_minorversion == 0) {
                if (!clp->cl_cred.cr_principal &&
-                                (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+                                (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
                        return -EINVAL;
                args.client_name = clp->cl_cred.cr_principal;
                args.prognumber = conn->cb_prog,
                args.protocol = XPRT_TRANSPORT_TCP;
-                args.authflavor = clp->cl_flavor;
+                args.authflavor = clp->cl_cred.cr_flavor;
                clp->cl_cb_ident = conn->cb_ident;
        } else {
                if (!conn->cb_xprt)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 5ff0b7b9fc08..43295d45cc2b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -154,6 +154,10 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
        if (status < 0)
                return;
+        status = mnt_want_write_file(rec_file);
+        if (status)
+                return;
        dir = rec_file->f_path.dentry;
        /* lock the parent */
        mutex_lock(&dir->d_inode->i_mutex);
@@ -173,11 +177,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
                 * as well be forgiving and just succeed silently.
                 */
                goto out_put;
-        status = mnt_want_write_file(rec_file);
-        if (status)
-                goto out_put;
        status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
-        mnt_drop_write_file(rec_file);
 out_put:
        dput(dentry);
 out_unlock:
@@ -189,6 +189,7 @@ out_unlock:
                                " (err %d); please check that %s exists"
                                " and is writeable", status,
                                user_recovery_dirname);
+        mnt_drop_write_file(rec_file);
        nfs4_reset_creds(original_cred);
 }
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index cc793005a87c..032af381b3aa 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -635,6 +635,7 @@ fh_put(struct svc_fh *fhp)
                fhp->fh_post_saved = 0;
 #endif
        }
+        fh_drop_write(fhp);
        if (exp) {
                exp_put(exp);
                fhp->fh_export = NULL;
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index e15dc45fc5ec..aad6d457b9e8 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -196,6 +196,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
        struct dentry   *dchild;
        int             type, mode;
        __be32          nfserr;
+        int             hosterr;
        dev_t           rdev = 0, wanted = new_decode_dev(attr->ia_size);
        dprintk("nfsd: CREATE   %s %.*s\n",
@@ -214,6 +215,12 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
        nfserr = nfserr_exist;
        if (isdotent(argp->name, argp->len))
                goto done;
+        hosterr = fh_want_write(dirfhp);
+        if (hosterr) {
+                nfserr = nfserrno(hosterr);
+                goto done;
+        }
        fh_lock_nested(dirfhp, I_MUTEX_PARENT);
        dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
        if (IS_ERR(dchild)) {
@@ -330,7 +337,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
 out_unlock:
        /* We don't really need to unlock, as fh_put does it. */
        fh_unlock(dirfhp);
+        fh_drop_write(dirfhp);
 done:
        fh_put(dirfhp);
        return nfsd_return_dirop(nfserr, resp);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e6173147f982..22bd0a66c356 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -231,7 +231,6 @@ struct nfs4_client {
        nfs4_verifier           cl_verifier;    /* generated by client */
        time_t                  cl_time;        /* time of last lease renewal */
        struct sockaddr_storage cl_addr;        /* client ipaddress */
-        u32                     cl_flavor;      /* setclientid pseudoflavor */
        struct svc_cred         cl_cred;        /* setclientid principal */
        clientid_t              cl_clientid;    /* generated by server */
        nfs4_verifier           cl_confirm;     /* generated by server */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 702f64e820c3..a9269f142cc4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1284,6 +1284,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
         * If it has, the parent directory should already be locked.
         */
        if (!resfhp->fh_dentry) {
+                host_err = fh_want_write(fhp);
+                if (host_err)
+                        goto out_nfserr;
                /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
                fh_lock_nested(fhp, I_MUTEX_PARENT);
                dchild = lookup_one_len(fname, dentry, flen);
@@ -1327,14 +1331,11 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                goto out;
        }
-        host_err = fh_want_write(fhp);
-        if (host_err)
-                goto out_nfserr;
        /*
         * Get the dir op function pointer.
         */
        err = 0;
+        host_err = 0;
        switch (type) {
        case S_IFREG:
                host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
@@ -1351,10 +1352,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
                break;
        }
-        if (host_err < 0) {
+        if (host_err < 0)
-                fh_drop_write(fhp);
                goto out_nfserr;
-        }
        err = nfsd_create_setattr(rqstp, resfhp, iap);
@@ -1366,7 +1365,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        err2 = nfserrno(commit_metadata(fhp));
        if (err2)
                err = err2;
-        fh_drop_write(fhp);
        /*
         * Update the file handle to get the new inode info.
         */
@@ -1425,6 +1423,11 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        err = nfserr_notdir;
        if (!dirp->i_op->lookup)
                goto out;
+        host_err = fh_want_write(fhp);
+        if (host_err)
+                goto out_nfserr;
        fh_lock_nested(fhp, I_MUTEX_PARENT);
        /*
@@ -1457,9 +1460,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
                v_atime = verifier[1]&0x7fffffff;
        }
        
-        host_err = fh_want_write(fhp);
-        if (host_err)
-                goto out_nfserr;
        if (dchild->d_inode) {
                err = 0;
@@ -1530,7 +1530,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (!err)
                err = nfserrno(commit_metadata(fhp));
-        fh_drop_write(fhp);
        /*
         * Update the filehandle to get the new inode info.
         */
@@ -1541,6 +1540,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
        fh_unlock(fhp);
        if (dchild && !IS_ERR(dchild))
                dput(dchild);
+        fh_drop_write(fhp);
        return err;
 
 out_nfserr:
@@ -1621,6 +1621,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
        err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
        if (err)
                goto out;
+        host_err = fh_want_write(fhp);
+        if (host_err)
+                goto out_nfserr;
        fh_lock(fhp);
        dentry = fhp->fh_dentry;
        dnew = lookup_one_len(fname, dentry, flen);
@@ -1628,10 +1633,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
        if (IS_ERR(dnew))
                goto out_nfserr;
-        host_err = fh_want_write(fhp);
-        if (host_err)
-                goto out_nfserr;
        if (unlikely(path[plen] != 0)) {
                char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
                if (path_alloced == NULL)
@@ -1691,6 +1692,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
        if (isdotent(name, len))
                goto out;
+        host_err = fh_want_write(tfhp);
+        if (host_err) {
+                err = nfserrno(host_err);
+                goto out;
+        }
        fh_lock_nested(ffhp, I_MUTEX_PARENT);
        ddir = ffhp->fh_dentry;
        dirp = ddir->d_inode;
@@ -1702,18 +1709,13 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
        dold = tfhp->fh_dentry;
-        host_err = fh_want_write(tfhp);
-        if (host_err) {
-                err = nfserrno(host_err);
-                goto out_dput;
-        }
        err = nfserr_noent;
        if (!dold->d_inode)
-                goto out_drop_write;
+                goto out_dput;
        host_err = nfsd_break_lease(dold->d_inode);
        if (host_err) {
                err = nfserrno(host_err);
-                goto out_drop_write;
+                goto out_dput;
        }
        host_err = vfs_link(dold, dirp, dnew);
        if (!host_err) {
@@ -1726,12 +1728,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
                else
                        err = nfserrno(host_err);
        }
-out_drop_write:
-        fh_drop_write(tfhp);
 out_dput:
        dput(dnew);
 out_unlock:
        fh_unlock(ffhp);
+        fh_drop_write(tfhp);
 out:
        return err;
@@ -1774,6 +1775,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
                goto out;
+        host_err = fh_want_write(ffhp);
+        if (host_err) {
+                err = nfserrno(host_err);
+                goto out;
+        }
        /* cannot use fh_lock as we need deadlock protective ordering
         * so do it by hand */
        trap = lock_rename(tdentry, fdentry);
@@ -1804,17 +1811,14 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
        host_err = -EXDEV;
        if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
                goto out_dput_new;
-        host_err = fh_want_write(ffhp);
-        if (host_err)
-                goto out_dput_new;
        host_err = nfsd_break_lease(odentry->d_inode);
        if (host_err)
-                goto out_drop_write;
+                goto out_dput_new;
        if (ndentry->d_inode) {
                host_err = nfsd_break_lease(ndentry->d_inode);
                if (host_err)
-                        goto out_drop_write;
+                        goto out_dput_new;
        }
        host_err = vfs_rename(fdir, odentry, tdir, ndentry);
        if (!host_err) {
@@ -1822,8 +1826,6 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
                if (!host_err)
                        host_err = commit_metadata(ffhp);
        }
-out_drop_write:
-        fh_drop_write(ffhp);
 out_dput_new:
        dput(ndentry);
 out_dput_old:
@@ -1839,6 +1841,7 @@ out_drop_write:
        fill_post_wcc(tfhp);
        unlock_rename(tdentry, fdentry);
        ffhp->fh_locked = tfhp->fh_locked = 0;
+        fh_drop_write(ffhp);
 out:
        return err;
@@ -1864,6 +1867,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        if (err)
                goto out;
+        host_err = fh_want_write(fhp);
+        if (host_err)
+                goto out_nfserr;
        fh_lock_nested(fhp, I_MUTEX_PARENT);
        dentry = fhp->fh_dentry;
        dirp = dentry->d_inode;
@@ -1882,21 +1889,15 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
        if (!type)
                type = rdentry->d_inode->i_mode & S_IFMT;
-        host_err = fh_want_write(fhp);
-        if (host_err)
-                goto out_put;
        host_err = nfsd_break_lease(rdentry->d_inode);
        if (host_err)
-                goto out_drop_write;
+                goto out_put;
        if (type != S_IFDIR)
                host_err = vfs_unlink(dirp, rdentry);
        else
                host_err = vfs_rmdir(dirp, rdentry);
        if (!host_err)
                host_err = commit_metadata(fhp);
-out_drop_write:
-        fh_drop_write(fhp);
 out_put:
        dput(rdentry);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index ec0611b2b738..359594c393d2 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -110,12 +110,19 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
 static inline int fh_want_write(struct svc_fh *fh)
 {
-        return mnt_want_write(fh->fh_export->ex_path.mnt);
+        int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
+        if (!ret)
+                fh->fh_want_write = 1;
+        return ret;
 }
 static inline void fh_drop_write(struct svc_fh *fh)
 {
-        mnt_drop_write(fh->fh_export->ex_path.mnt);
+        if (fh->fh_want_write) {
+                fh->fh_want_write = 0;
+                mnt_drop_write(fh->fh_export->ex_path.mnt);
+        }
 }
 #endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 62cebc8e1a1f..a4d56ac02e6c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -69,16 +69,18 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_dentry->d_inode;
        struct nilfs_transaction_info ti;
-        int ret;
+        int ret = 0;
        if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
                return VM_FAULT_SIGBUS; /* -ENOSPC */
+        sb_start_pagefault(inode->i_sb);
        lock_page(page);
        if (page->mapping != inode->i_mapping ||
            page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
                unlock_page(page);
-                return VM_FAULT_NOPAGE; /* make the VM retry the fault */
+                ret = -EFAULT;  /* make the VM retry the fault */
+                goto out;
        }
        /*
@@ -112,19 +114,21 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
        /* never returns -ENOMEM, but may return -ENOSPC */
        if (unlikely(ret))
-                return VM_FAULT_SIGBUS;
+                goto out;
-        ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
+        ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
-        if (ret != VM_FAULT_LOCKED) {
+        if (ret) {
                nilfs_transaction_abort(inode->i_sb);
-                return ret;
+                goto out;
        }
        nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
        nilfs_transaction_commit(inode->i_sb);
 mapped:
        wait_on_page_writeback(page);
-        return VM_FAULT_LOCKED;
+ out:
+        sb_end_pagefault(inode->i_sb);
+        return block_page_mkwrite_return(ret);
 }
 static const struct vm_operations_struct nilfs_file_vm_ops = {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 0b6387c67e6c..fdb180769485 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -660,8 +660,6 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
                goto out_free;
        }
-        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
        ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
        if (ret < 0)
                printk(KERN_ERR "NILFS: GC failed during preparation: "
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 88e11fb346b6..a5752a589932 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -189,7 +189,7 @@ int nilfs_transaction_begin(struct super_block *sb,
        if (ret > 0)
                return 0;
-        vfs_check_frozen(sb, SB_FREEZE_WRITE);
+        sb_start_intwrite(sb);
        nilfs = sb->s_fs_info;
        down_read(&nilfs->ns_segctor_sem);
@@ -205,6 +205,7 @@ int nilfs_transaction_begin(struct super_block *sb,
        current->journal_info = ti->ti_save;
        if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
                kmem_cache_free(nilfs_transaction_cachep, ti);
+        sb_end_intwrite(sb);
        return ret;
 }
@@ -246,6 +247,7 @@ int nilfs_transaction_commit(struct super_block *sb)
                err = nilfs_construct_segment(sb);
        if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
                kmem_cache_free(nilfs_transaction_cachep, ti);
+        sb_end_intwrite(sb);
        return err;
 }
@@ -264,6 +266,7 @@ void nilfs_transaction_abort(struct super_block *sb)
        current->journal_info = ti->ti_save;
        if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
                kmem_cache_free(nilfs_transaction_cachep, ti);
+        sb_end_intwrite(sb);
 }
 void nilfs_relax_pressure_in_lock(struct super_block *sb)
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6522cac6057c..6a10812711c1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -676,17 +676,13 @@ static const struct super_operations nilfs_sops = {
        .alloc_inode    = nilfs_alloc_inode,
        .destroy_inode  = nilfs_destroy_inode,
        .dirty_inode    = nilfs_dirty_inode,
-        /* .write_inode    = nilfs_write_inode, */
-        /* .drop_inode    = nilfs_drop_inode, */
        .evict_inode    = nilfs_evict_inode,
        .put_super      = nilfs_put_super,
-        /* .write_super    = nilfs_write_super, */
        .sync_fs        = nilfs_sync_fs,
        .freeze_fs      = nilfs_freeze,
        .unfreeze_fs    = nilfs_unfreeze,
        .statfs         = nilfs_statfs,
        .remount_fs     = nilfs_remount,
-        /* .umount_begin */
        .show_options = nilfs_show_options
 };
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 6eee4177807b..be1267a34cea 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -107,8 +107,6 @@ struct the_nilfs {
         * used for
         * - loading the latest checkpoint exclusively.
         * - allocating a new full segment.
-         * - protecting s_dirt in the super_block struct
-         *   (see nilfs_write_super) and the following fields.
         */
        struct buffer_head     *ns_sbh[2];
        struct nilfs_super_block *ns_sbp[2];
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 7389d2d5e51d..1ecf46448f85 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2084,7 +2084,6 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
        if (err)
                return err;
        pos = *ppos;
-        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
        /* We can write back this queue in page reclaim. */
        current->backing_dev_info = mapping->backing_dev_info;
        written = 0;
@@ -2119,6 +2118,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        BUG_ON(iocb->ki_pos != pos);
+        sb_start_write(inode->i_sb);
        mutex_lock(&inode->i_mutex);
        ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);
@@ -2127,6 +2127,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                if (err < 0)
                        ret = err;
        }
+        sb_end_write(inode->i_sb);
        return ret;
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 7602783d7f41..46a1f6d75104 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1971,6 +1971,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
 {
        struct inode *inode = file->f_path.dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        int ret;
        if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
            !ocfs2_writes_unwritten_extents(osb))
@@ -1985,7 +1986,12 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd,
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
-        return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+        ret = mnt_want_write_file(file);
+        if (ret)
+                return ret;
+        ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+        mnt_drop_write_file(file);
+        return ret;
 }
 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
@@ -2261,7 +2267,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
        if (iocb->ki_left == 0)
                return 0;
-        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+        sb_start_write(inode->i_sb);
        appending = file->f_flags & O_APPEND ? 1 : 0;
        direct_io = file->f_flags & O_DIRECT ? 1 : 0;
@@ -2436,6 +2442,7 @@ out_sems:
                ocfs2_iocb_clear_sem_locked(iocb);
        mutex_unlock(&inode->i_mutex);
+        sb_end_write(inode->i_sb);
        if (written)
                ret = written;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index d96f7f81d8dd..f20edcbfe700 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -928,7 +928,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if (get_user(new_clusters, (int __user *)arg))
                        return -EFAULT;
-                return ocfs2_group_extend(inode, new_clusters);
+                status = mnt_want_write_file(filp);
+                if (status)
+                        return status;
+                status = ocfs2_group_extend(inode, new_clusters);
+                mnt_drop_write_file(filp);
+                return status;
        case OCFS2_IOC_GROUP_ADD:
        case OCFS2_IOC_GROUP_ADD64:
                if (!capable(CAP_SYS_RESOURCE))
@@ -937,7 +942,12 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
                        return -EFAULT;
-                return ocfs2_group_add(inode, &input);
+                status = mnt_want_write_file(filp);
+                if (status)
+                        return status;
+                status = ocfs2_group_add(inode, &input);
+                mnt_drop_write_file(filp);
+                return status;
        case OCFS2_IOC_REFLINK:
                if (copy_from_user(&args, argp, sizeof(args)))
                        return -EFAULT;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 0a42ae96dca7..2dd36af79e26 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -355,11 +355,14 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
        if (journal_current_handle())
                return jbd2_journal_start(journal, max_buffs);
+        sb_start_intwrite(osb->sb);
        down_read(&osb->journal->j_trans_barrier);
        handle = jbd2_journal_start(journal, max_buffs);
        if (IS_ERR(handle)) {
                up_read(&osb->journal->j_trans_barrier);
+                sb_end_intwrite(osb->sb);
                mlog_errno(PTR_ERR(handle));
@@ -388,8 +391,10 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
        if (ret < 0)
                mlog_errno(ret);
-        if (!nested)
+        if (!nested) {
                up_read(&journal->j_trans_barrier);
+                sb_end_intwrite(osb->sb);
+        }
        return ret;
 }
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9cd41083e991..d150372fd81d 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -136,6 +136,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        sigset_t oldset;
        int ret;
+        sb_start_pagefault(inode->i_sb);
        ocfs2_block_signals(&oldset);
        /*
@@ -165,6 +166,7 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 out:
        ocfs2_unblock_signals(&oldset);
+        sb_end_pagefault(inode->i_sb);
        return ret;
 }
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 9f32d7cbb7a3..30a055049e16 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4466,20 +4466,11 @@ int ocfs2_reflink_ioctl(struct inode *inode,
                goto out_dput;
        }
-        error = mnt_want_write(new_path.mnt);
-        if (error) {
-                mlog_errno(error);
-                goto out_dput;
-        }
        error = ocfs2_vfs_reflink(old_path.dentry,
                                  new_path.dentry->d_inode,
                                  new_dentry, preserve);
-        mnt_drop_write(new_path.mnt);
 out_dput:
-        dput(new_dentry);
+        done_path_create(&new_path, new_dentry);
-        mutex_unlock(&new_path.dentry->d_inode->i_mutex);
-        path_put(&new_path);
 out:
        path_put(&old_path);
diff --git a/fs/open.c b/fs/open.c
index 1e914b397e12..e1f2cdb91a4d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -164,11 +164,13 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
        if (IS_APPEND(inode))
                goto out_putf;
+        sb_start_write(inode->i_sb);
        error = locks_verify_truncate(inode, file, length);
        if (!error)
                error = security_path_truncate(&file->f_path);
        if (!error)
                error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
+        sb_end_write(inode->i_sb);
 out_putf:
        fput(file);
 out:
@@ -266,7 +268,10 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
        if (!file->f_op->fallocate)
                return -EOPNOTSUPP;
-        return file->f_op->fallocate(file, mode, offset, len);
+        sb_start_write(inode->i_sb);
+        ret = file->f_op->fallocate(file, mode, offset, len);
+        sb_end_write(inode->i_sb);
+        return ret;
 }
 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
@@ -620,7 +625,7 @@ static inline int __get_file_write_access(struct inode *inode,
                /*
                 * Balanced in __fput()
                 */
-                error = mnt_want_write(mnt);
+                error = __mnt_want_write(mnt);
                if (error)
                        put_write_access(inode);
        }
@@ -654,6 +659,7 @@ static int do_dentry_open(struct file *f,
        if (unlikely(f->f_flags & O_PATH))
                f->f_mode = FMODE_PATH;
+        path_get(&f->f_path);
        inode = f->f_path.dentry->d_inode;
        if (f->f_mode & FMODE_WRITE) {
                error = __get_file_write_access(inode, f->f_path.mnt);
@@ -711,7 +717,7 @@ cleanup_all:
                         * here, so just reset the state.
                         */
                        file_reset_write(f);
-                        mnt_drop_write(f->f_path.mnt);
+                        __mnt_drop_write(f->f_path.mnt);
                }
        }
 cleanup_file:
@@ -739,9 +745,7 @@ int finish_open(struct file *file, struct dentry *dentry,
        int error;
        BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
-        mntget(file->f_path.mnt);
+        file->f_path.dentry = dentry;
-        file->f_path.dentry = dget(dentry);
        error = do_dentry_open(file, open, current_cred());
        if (!error)
                *opened |= FILE_OPENED;
@@ -784,7 +788,6 @@ struct file *dentry_open(const struct path *path, int flags,
        f->f_flags = flags;
        f->f_path = *path;
-        path_get(&f->f_path);
        error = do_dentry_open(f, NULL, cred);
        if (!error) {
                error = open_check_o_direct(f);
@@ -849,9 +852,10 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
        int lookup_flags = 0;
        int acc_mode;
-        if (!(flags & O_CREAT))
+        if (flags & O_CREAT)
-                mode = 0;
+                op->mode = (mode & S_IALLUGO) | S_IFREG;
-        op->mode = mode;
+        else
+                op->mode = 0;
        /* Must never be set by userspace */
        flags &= ~FMODE_NONOTIFY;
diff --git a/fs/pipe.c b/fs/pipe.c
index 95cbd6b227e6..8d85d7068c1e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,18 +1016,16 @@ fail_inode:
        return NULL;
 }
-struct file *create_write_pipe(int flags)
+int create_pipe_files(struct file **res, int flags)
 {
        int err;
-        struct inode *inode;
+        struct inode *inode = get_pipe_inode();
        struct file *f;
        struct path path;
-        struct qstr name = { .name = "" };
+        static struct qstr name = { .name = "" };
-        err = -ENFILE;
-        inode = get_pipe_inode();
        if (!inode)
-                goto err;
+                return -ENFILE;
        err = -ENOMEM;
        path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
@@ -1041,62 +1039,43 @@ struct file *create_write_pipe(int flags)
        f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
        if (!f)
                goto err_dentry;
-        f->f_mapping = inode->i_mapping;
        f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
-        f->f_version = 0;
-        return f;
+        res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops);
+        if (!res[0])
+                goto err_file;
+        path_get(&path);
+        res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+        res[1] = f;
+        return 0;
- err_dentry:
+err_file:
+        put_filp(f);
+err_dentry:
        free_pipe_info(inode);
        path_put(&path);
-        return ERR_PTR(err);
+        return err;
- err_inode:
+err_inode:
        free_pipe_info(inode);
        iput(inode);
- err:
+        return err;
-        return ERR_PTR(err);
-}
-void free_write_pipe(struct file *f)
-{
-        free_pipe_info(f->f_dentry->d_inode);
-        path_put(&f->f_path);
-        put_filp(f);
-}
-struct file *create_read_pipe(struct file *wrf, int flags)
-{
-        /* Grab pipe from the writer */
-        struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
-                                    &read_pipefifo_fops);
-        if (!f)
-                return ERR_PTR(-ENFILE);
-        path_get(&wrf->f_path);
-        f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-        return f;
 }
 int do_pipe_flags(int *fd, int flags)
 {
-        struct file *fw, *fr;
+        struct file *files[2];
        int error;
        int fdw, fdr;
        if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
                return -EINVAL;
-        fw = create_write_pipe(flags);
+        error = create_pipe_files(files, flags);
-        if (IS_ERR(fw))
+        if (error)
-                return PTR_ERR(fw);
+                return error;
-        fr = create_read_pipe(fw, flags);
-        error = PTR_ERR(fr);
-        if (IS_ERR(fr))
-                goto err_write_pipe;
        error = get_unused_fd_flags(flags);
        if (error < 0)
@@ -1109,8 +1088,8 @@ int do_pipe_flags(int *fd, int flags)
        fdw = error;
        audit_fd_pair(fdr, fdw);
-        fd_install(fdr, fr);
+        fd_install(fdr, files[0]);
-        fd_install(fdw, fw);
+        fd_install(fdw, files[1]);
        fd[0] = fdr;
        fd[1] = fdw;
@@ -1119,10 +1098,8 @@ int do_pipe_flags(int *fd, int flags)
 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
-        path_put(&fr->f_path);
+        fput(files[0]);
-        put_filp(fr);
+        fput(files[1]);
- err_write_pipe:
-        free_write_pipe(fw);
        return error;
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 36a29b753c79..c495a3055e2a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1589,10 +1589,10 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
                goto out;
        }
-        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;
+        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        spin_lock(&dq_data_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!dquots[cnt])
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4c0c7d163d15..a98b7740a0fc 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1334,9 +1334,7 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
        else if (bitmap == 0)
                block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
-        reiserfs_write_unlock(sb);
        bh = sb_bread(sb, block);
-        reiserfs_write_lock(sb);
        if (bh == NULL)
                reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
                                 "reading failed", __func__, block);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a6d4268fb6c1..855da58db145 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -76,10 +76,10 @@ void reiserfs_evict_inode(struct inode *inode)
                ;
        }
      out:
+        reiserfs_write_unlock_once(inode->i_sb, depth);
        clear_inode(inode);     /* note this must go after the journal_end to prevent deadlock */
        dquot_drop(inode);
        inode->i_blocks = 0;
-        reiserfs_write_unlock_once(inode->i_sb, depth);
        return;
 no_delete:
diff --git a/fs/splice.c b/fs/splice.c
index 7bf08fa22ec9..41514dd89462 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -996,6 +996,8 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
        };
        ssize_t ret;
+        sb_start_write(inode->i_sb);
        pipe_lock(pipe);
        splice_from_pipe_begin(&sd);
@@ -1034,6 +1036,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                        *ppos += ret;
                balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
        }
+        sb_end_write(inode->i_sb);
        return ret;
 }
diff --git a/fs/super.c b/fs/super.c
index 4bf714459a4b..0902cfa6a12e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,12 +33,19 @@
 #include <linux/rculist_bl.h>
 #include <linux/cleancache.h>
 #include <linux/fsnotify.h>
+#include <linux/lockdep.h>
 #include "internal.h"
 LIST_HEAD(super_blocks);
 DEFINE_SPINLOCK(sb_lock);
+static char *sb_writers_name[SB_FREEZE_LEVELS] = {
+        "sb_writers",
+        "sb_pagefaults",
+        "sb_internal",
+};
 /*
 * One thing we have to be careful of with a per-sb shrinker is that we don't
 * drop the last active reference to the superblock from within the shrinker.
@@ -102,6 +109,35 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
        return total_objects;
 }
+static int init_sb_writers(struct super_block *s, struct file_system_type *type)
+{
+        int err;
+        int i;
+        for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+                err = percpu_counter_init(&s->s_writers.counter[i], 0);
+                if (err < 0)
+                        goto err_out;
+                lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+                                 &type->s_writers_key[i], 0);
+        }
+        init_waitqueue_head(&s->s_writers.wait);
+        init_waitqueue_head(&s->s_writers.wait_unfrozen);
+        return 0;
+err_out:
+        while (--i >= 0)
+                percpu_counter_destroy(&s->s_writers.counter[i]);
+        return err;
+}
+static void destroy_sb_writers(struct super_block *s)
+{
+        int i;
+        for (i = 0; i < SB_FREEZE_LEVELS; i++)
+                percpu_counter_destroy(&s->s_writers.counter[i]);
+}
 /**
 *      alloc_super     -       create new superblock
 *      @type:  filesystem type superblock should belong to
@@ -117,18 +153,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
        if (s) {
                if (security_sb_alloc(s)) {
+                        /*
+                         * We cannot call security_sb_free() without
+                         * security_sb_alloc() succeeding. So bail out manually.
+                         */
                        kfree(s);
                        s = NULL;
                        goto out;
                }
 #ifdef CONFIG_SMP
                s->s_files = alloc_percpu(struct list_head);
-                if (!s->s_files) {
+                if (!s->s_files)
-                        security_sb_free(s);
+                        goto err_out;
-                        kfree(s);
+                else {
-                        s = NULL;
-                        goto out;
-                } else {
                        int i;
                        for_each_possible_cpu(i)
@@ -137,6 +174,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 #else
                INIT_LIST_HEAD(&s->s_files);
 #endif
+                if (init_sb_writers(s, type))
+                        goto err_out;
                s->s_flags = flags;
                s->s_bdi = &default_backing_dev_info;
                INIT_HLIST_NODE(&s->s_instances);
@@ -178,7 +217,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
                mutex_init(&s->s_dquot.dqio_mutex);
                mutex_init(&s->s_dquot.dqonoff_mutex);
                init_rwsem(&s->s_dquot.dqptr_sem);
-                init_waitqueue_head(&s->s_wait_unfrozen);
                s->s_maxbytes = MAX_NON_LFS;
                s->s_op = &default_op;
                s->s_time_gran = 1000000000;
@@ -190,6 +228,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
        }
 out:
        return s;
+err_out:
+        security_sb_free(s);
+#ifdef CONFIG_SMP
+        if (s->s_files)
+                free_percpu(s->s_files);
+#endif
+        destroy_sb_writers(s);
+        kfree(s);
+        s = NULL;
+        goto out;
 }
 /**
@@ -203,6 +251,7 @@ static inline void destroy_super(struct super_block *s)
 #ifdef CONFIG_SMP
        free_percpu(s->s_files);
 #endif
+        destroy_sb_writers(s);
        security_sb_free(s);
        WARN_ON(!list_empty(&s->s_mounts));
        kfree(s->s_subtype);
@@ -488,46 +537,6 @@ void drop_super(struct super_block *sb)
 EXPORT_SYMBOL(drop_super);
 /**
- * sync_supers - helper for periodic superblock writeback
- *
- * Call the write_super method if present on all dirty superblocks in
- * the system.  This is for the periodic writeback used by most older
- * filesystems.  For data integrity superblock writeback use
- * sync_filesystems() instead.
- *
- * Note: check the dirty flag before waiting, so we don't
- * hold up the sync while mounting a device. (The newly
- * mounted device won't need syncing.)
- */
-void sync_supers(void)
-{
-        struct super_block *sb, *p = NULL;
-        spin_lock(&sb_lock);
-        list_for_each_entry(sb, &super_blocks, s_list) {
-                if (hlist_unhashed(&sb->s_instances))
-                        continue;
-                if (sb->s_op->write_super && sb->s_dirt) {
-                        sb->s_count++;
-                        spin_unlock(&sb_lock);
-                        down_read(&sb->s_umount);
-                        if (sb->s_root && sb->s_dirt && (sb->s_flags & MS_BORN))
-                                sb->s_op->write_super(sb);
-                        up_read(&sb->s_umount);
-                        spin_lock(&sb_lock);
-                        if (p)
-                                __put_super(p);
-                        p = sb;
-                }
-        }
-        if (p)
-                __put_super(p);
-        spin_unlock(&sb_lock);
-}
-/**
 *      iterate_supers - call function for all active superblocks
 *      @f: function to call
 *      @arg: argument to pass to it
@@ -651,10 +660,11 @@ struct super_block *get_super_thawed(struct block_device *bdev)
 {
        while (1) {
                struct super_block *s = get_super(bdev);
-                if (!s || s->s_frozen == SB_UNFROZEN)
+                if (!s || s->s_writers.frozen == SB_UNFROZEN)
                        return s;
                up_read(&s->s_umount);
-                vfs_check_frozen(s, SB_FREEZE_WRITE);
+                wait_event(s->s_writers.wait_unfrozen,
+                           s->s_writers.frozen == SB_UNFROZEN);
                put_super(s);
        }
 }
@@ -732,7 +742,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
        int retval;
        int remount_ro;
-        if (sb->s_frozen != SB_UNFROZEN)
+        if (sb->s_writers.frozen != SB_UNFROZEN)
                return -EBUSY;
 #ifdef CONFIG_BLOCK
@@ -1163,6 +1173,120 @@ out:
        return ERR_PTR(error);
 }
+/*
+ * This is an internal function, please use sb_end_{write,pagefault,intwrite}
+ * instead.
+ */
+void __sb_end_write(struct super_block *sb, int level)
+{
+        percpu_counter_dec(&sb->s_writers.counter[level-1]);
+        /*
+         * Make sure s_writers are updated before we wake up waiters in
+         * freeze_super().
+         */
+        smp_mb();
+        if (waitqueue_active(&sb->s_writers.wait))
+                wake_up(&sb->s_writers.wait);
+        rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+}
+EXPORT_SYMBOL(__sb_end_write);
+#ifdef CONFIG_LOCKDEP
+/*
+ * We want lockdep to tell us about possible deadlocks with freezing but
+ * it's a bit tricky to properly instrument it. Getting a freeze protection
+ * works as getting a read lock but there are subtle problems. XFS for example
+ * gets freeze protection on internal level twice in some cases, which is OK
+ * only because we already hold a freeze protection also on higher level. Due
+ * to these cases we have to tell lockdep we are doing trylock when we
+ * already hold a freeze protection for a higher freeze level.
+ */
+static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
+                                unsigned long ip)
+{
+        int i;
+        if (!trylock) {
+                for (i = 0; i < level - 1; i++)
+                        if (lock_is_held(&sb->s_writers.lock_map[i])) {
+                                trylock = true;
+                                break;
+                        }
+        }
+        rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
+}
+#endif
+/*
+ * This is an internal function, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+int __sb_start_write(struct super_block *sb, int level, bool wait)
+{
+retry:
+        if (unlikely(sb->s_writers.frozen >= level)) {
+                if (!wait)
+                        return 0;
+                wait_event(sb->s_writers.wait_unfrozen,
+                           sb->s_writers.frozen < level);
+        }
+#ifdef CONFIG_LOCKDEP
+        acquire_freeze_lock(sb, level, !wait, _RET_IP_);
+#endif
+        percpu_counter_inc(&sb->s_writers.counter[level-1]);
+        /*
+         * Make sure counter is updated before we check for frozen.
+         * freeze_super() first sets frozen and then checks the counter.
+         */
+        smp_mb();
+        if (unlikely(sb->s_writers.frozen >= level)) {
+                __sb_end_write(sb, level);
+                goto retry;
+        }
+        return 1;
+}
+EXPORT_SYMBOL(__sb_start_write);
+/**
+ * sb_wait_write - wait until all writers to given file system finish
+ * @sb: the super for which we wait
+ * @level: type of writers we wait for (normal, page fault or internal)
+ *
+ * This function waits until there are no writers of given type to given file
+ * system. Caller of this function should make sure there can be no new writers
+ * of type @level before calling this function. Otherwise this function can
+ * livelock.
+ */
+static void sb_wait_write(struct super_block *sb, int level)
+{
+        s64 writers;
+        /*
+         * We just cycle-through lockdep here so that it does not complain
+         * about returning with lock to userspace
+         */
+        rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+        rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
+        do {
+                DEFINE_WAIT(wait);
+                /*
+                 * We use a barrier in prepare_to_wait() to separate setting
+                 * of frozen and checking of the counter
+                 */
+                prepare_to_wait(&sb->s_writers.wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+                writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
+                if (writers)
+                        schedule();
+                finish_wait(&sb->s_writers.wait, &wait);
+        } while (writers);
+}
 /**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
@@ -1170,6 +1294,31 @@ out:
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs will return
 * -EBUSY.
+ *
+ * During this function, sb->s_writers.frozen goes through these values:
+ *
+ * SB_UNFROZEN: File system is normal, all writes progress as usual.
+ *
+ * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
+ * writes should be blocked, though page faults are still allowed. We wait for
+ * all writes to complete and then proceed to the next stage.
+ *
+ * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
+ * but internal fs threads can still modify the filesystem (although they
+ * should not dirty new pages or inodes), writeback can run etc. After waiting
+ * for all running page faults we sync the filesystem which will clean all
+ * dirty pages and inodes (no new dirty pages or inodes can be created when
+ * sync is running).
+ *
+ * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
+ * modification are blocked (e.g. XFS preallocation truncation on inode
+ * reclaim). This is usually implemented by blocking new transactions for
+ * filesystems that have them and need this additional guard. After all
+ * internal writers are finished we call ->freeze_fs() to finish filesystem
+ * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
+ * mostly auxiliary for filesystems to verify they do not modify frozen fs.
+ *
+ * sb->s_writers.frozen is protected by sb->s_umount.
 */
 int freeze_super(struct super_block *sb)
 {
@@ -1177,7 +1326,7 @@ int freeze_super(struct super_block *sb)
        atomic_inc(&sb->s_active);
        down_write(&sb->s_umount);
-        if (sb->s_frozen) {
+        if (sb->s_writers.frozen != SB_UNFROZEN) {
                deactivate_locked_super(sb);
                return -EBUSY;
        }
@@ -1188,33 +1337,53 @@ int freeze_super(struct super_block *sb)
        }
        if (sb->s_flags & MS_RDONLY) {
-                sb->s_frozen = SB_FREEZE_TRANS;
+                /* Nothing to do really... */
-                smp_wmb();
+                sb->s_writers.frozen = SB_FREEZE_COMPLETE;
                up_write(&sb->s_umount);
                return 0;
        }
-        sb->s_frozen = SB_FREEZE_WRITE;
+        /* From now on, no new normal writers can start */
+        sb->s_writers.frozen = SB_FREEZE_WRITE;
        smp_wmb();
+        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
+        up_write(&sb->s_umount);
+        sb_wait_write(sb, SB_FREEZE_WRITE);
+        /* Now we go and block page faults... */
+        down_write(&sb->s_umount);
+        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
+        smp_wmb();
+        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
+        /* All writers are done so after syncing there won't be dirty data */
        sync_filesystem(sb);
-        sb->s_frozen = SB_FREEZE_TRANS;
+        /* Now wait for internal filesystem counter */
+        sb->s_writers.frozen = SB_FREEZE_FS;
        smp_wmb();
+        sb_wait_write(sb, SB_FREEZE_FS);
-        sync_blockdev(sb->s_bdev);
        if (sb->s_op->freeze_fs) {
                ret = sb->s_op->freeze_fs(sb);
                if (ret) {
                        printk(KERN_ERR
                                "VFS:Filesystem freeze failed\n");
-                        sb->s_frozen = SB_UNFROZEN;
+                        sb->s_writers.frozen = SB_UNFROZEN;
                        smp_wmb();
-                        wake_up(&sb->s_wait_unfrozen);
+                        wake_up(&sb->s_writers.wait_unfrozen);
                        deactivate_locked_super(sb);
                        return ret;
                }
        }
+        /*
+         * This is just for debugging purposes so that fs can warn if it
+         * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
+         */
+        sb->s_writers.frozen = SB_FREEZE_COMPLETE;
        up_write(&sb->s_umount);
        return 0;
 }
@@ -1231,7 +1400,7 @@ int thaw_super(struct super_block *sb)
        int error;
        down_write(&sb->s_umount);
-        if (sb->s_frozen == SB_UNFROZEN) {
+        if (sb->s_writers.frozen == SB_UNFROZEN) {
                up_write(&sb->s_umount);
                return -EINVAL;
        }
@@ -1244,16 +1413,15 @@ int thaw_super(struct super_block *sb)
                if (error) {
                        printk(KERN_ERR
                                "VFS:Filesystem thaw failed\n");
-                        sb->s_frozen = SB_FREEZE_TRANS;
                        up_write(&sb->s_umount);
                        return error;
                }
        }
 out:
-        sb->s_frozen = SB_UNFROZEN;
+        sb->s_writers.frozen = SB_UNFROZEN;
        smp_wmb();
-        wake_up(&sb->s_wait_unfrozen);
+        wake_up(&sb->s_writers.wait_unfrozen);
        deactivate_locked_super(sb);
        return 0;
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index a4759833d62d..614b2b544880 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -228,6 +228,8 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        ret = 0;
        if (bb->vm_ops->page_mkwrite)
                ret = bb->vm_ops->page_mkwrite(vma, vmf);
+        else
+                file_update_time(file);
        sysfs_put_active(attr_sd);
        return ret;
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8b8cc4e945f4..760de723dadb 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -167,7 +167,7 @@ struct ubifs_global_debug_info {
 #define ubifs_dbg_msg(type, fmt, ...) \
        pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
-#define DBG_KEY_BUF_LEN 32
+#define DBG_KEY_BUF_LEN 48
 #define ubifs_dbg_msg_key(type, key, fmt, ...) do {                            \
        char __tmp_key_buf[DBG_KEY_BUF_LEN];                                   \
        pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__,             \
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 35389ca2d267..7bd6e72afd11 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -37,11 +37,11 @@
 *
 * A thing to keep in mind: inode @i_mutex is locked in most VFS operations we
 * implement. However, this is not true for 'ubifs_writepage()', which may be
- * called with @i_mutex unlocked. For example, when pdflush is doing background
+ * called with @i_mutex unlocked. For example, when a flusher thread is doing
- * write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex. At "normal"
+ * background write-back, it calls 'ubifs_writepage()' with unlocked @i_mutex.
- * work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g. in the
+ * At "normal" work-paths the @i_mutex is locked in 'ubifs_writepage()', e.g.
- * "sys_write -> alloc_pages -> direct reclaim path". So, in 'ubifs_writepage()'
+ * in the "sys_write -> alloc_pages -> direct reclaim path". So, in
- * we are only guaranteed that the page is locked.
+ * 'ubifs_writepage()' we are only guaranteed that the page is locked.
 *
 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index ce33b2beb151..8640920766ed 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1749,7 +1749,10 @@ int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
        return 0;
 out_err:
-        ubifs_lpt_free(c, 0);
+        if (wr)
+                ubifs_lpt_free(c, 1);
+        if (rd)
+                ubifs_lpt_free(c, 0);
        return err;
 }
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c30d976b4be8..edeec499c048 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -788,7 +788,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
 corrupted_rescan:
        /* Re-scan the corrupted data with verbose messages */
-        ubifs_err("corruptio %d", ret);
+        ubifs_err("corruption %d", ret);
        ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
 corrupted:
        ubifs_scanned_corruption(c, lnum, offs, buf);
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index eba46d4a7619..94d78fc5d4e0 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1026,7 +1026,6 @@ int ubifs_replay_journal(struct ubifs_info *c)
        c->replaying = 1;
        lnum = c->ltail_lnum = c->lhead_lnum;
-        lnum = UBIFS_LOG_LNUM;
        do {
                err = replay_log_leb(c, lnum, 0, c->sbuf);
                if (err == 1)
@@ -1035,7 +1034,7 @@ int ubifs_replay_journal(struct ubifs_info *c)
                if (err)
                        goto out;
                lnum = ubifs_next_log_lnum(c, lnum);
-        } while (lnum != UBIFS_LOG_LNUM);
+        } while (lnum != c->ltail_lnum);
        err = replay_buds(c);
        if (err)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1c766c39c038..71a197f0f93d 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -303,7 +303,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
        mutex_lock(&ui->ui_mutex);
        /*
         * Due to races between write-back forced by budgeting
-         * (see 'sync_some_inodes()') and pdflush write-back, the inode may
+         * (see 'sync_some_inodes()') and background write-back, the inode may
         * have already been synchronized, do not do this again. This might
         * also happen if it was synchronized in an VFS operation, e.g.
         * 'ubifs_link()'.
@@ -1157,9 +1157,6 @@ static int check_free_space(struct ubifs_info *c)
 *
 * This function mounts UBIFS file system. Returns zero in case of success and
 * a negative error code in case of failure.
- *
- * Note, the function does not de-allocate resources it it fails half way
- * through, and the caller has to do this instead.
 */
 static int mount_ubifs(struct ubifs_info *c)
 {
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7f3f7ba3df6e..d1c6093fd3d3 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -39,20 +39,24 @@
 #include "udf_i.h"
 #include "udf_sb.h"
-static int udf_adinicb_readpage(struct file *file, struct page *page)
+static void __udf_adinicb_readpage(struct page *page)
 {
        struct inode *inode = page->mapping->host;
        char *kaddr;
        struct udf_inode_info *iinfo = UDF_I(inode);
-        BUG_ON(!PageLocked(page));
        kaddr = kmap(page);
-        memset(kaddr, 0, PAGE_CACHE_SIZE);
        memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
+        memset(kaddr + inode->i_size, 0, PAGE_CACHE_SIZE - inode->i_size);
        flush_dcache_page(page);
        SetPageUptodate(page);
        kunmap(page);
+}
+static int udf_adinicb_readpage(struct file *file, struct page *page)
+{
+        BUG_ON(!PageLocked(page));
+        __udf_adinicb_readpage(page);
        unlock_page(page);
        return 0;
@@ -77,6 +81,25 @@ static int udf_adinicb_writepage(struct page *page,
        return 0;
 }
+static int udf_adinicb_write_begin(struct file *file,
+                        struct address_space *mapping, loff_t pos,
+                        unsigned len, unsigned flags, struct page **pagep,
+                        void **fsdata)
+{
+        struct page *page;
+        if (WARN_ON_ONCE(pos >= PAGE_CACHE_SIZE))
+                return -EIO;
+        page = grab_cache_page_write_begin(mapping, 0, flags);
+        if (!page)
+                return -ENOMEM;
+        *pagep = page;
+        if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
+                __udf_adinicb_readpage(page);
+        return 0;
+}
 static int udf_adinicb_write_end(struct file *file,
                        struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
@@ -98,8 +121,8 @@ static int udf_adinicb_write_end(struct file *file,
 const struct address_space_operations udf_adinicb_aops = {
        .readpage       = udf_adinicb_readpage,
        .writepage      = udf_adinicb_writepage,
-        .write_begin = simple_write_begin,
+        .write_begin    = udf_adinicb_write_begin,
-        .write_end = udf_adinicb_write_end,
+        .write_end      = udf_adinicb_write_end,
 };
 static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index fafaad795cd6..aa233469b3c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1124,14 +1124,17 @@ int udf_setsize(struct inode *inode, loff_t newsize)
                                if (err)
                                        return err;
                                down_write(&iinfo->i_data_sem);
-                        } else
+                        } else {
                                iinfo->i_lenAlloc = newsize;
+                                goto set_size;
+                        }
                }
                err = udf_extend_file(inode, newsize);
                if (err) {
                        up_write(&iinfo->i_data_sem);
                        return err;
                }
+set_size:
                truncate_setsize(inode, newsize);
                up_write(&iinfo->i_data_sem);
        } else {
diff --git a/fs/udf/super.c b/fs/udf/super.c
index dcbf98722afc..18fc038a438d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1344,6 +1344,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
                udf_err(sb, "error loading logical volume descriptor: "
                        "Partition table too long (%u > %lu)\n", table_len,
                        sb->s_blocksize - sizeof(*lvd));
+                ret = 1;
                goto out_bh;
        }
@@ -1388,8 +1389,10 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
                                                UDF_ID_SPARABLE,
                                                strlen(UDF_ID_SPARABLE))) {
                                if (udf_load_sparable_map(sb, map,
-                                    (struct sparablePartitionMap *)gpm) < 0)
+                                    (struct sparablePartitionMap *)gpm) < 0) {
+                                        ret = 1;
                                        goto out_bh;
+                                }
                        } else if (!strncmp(upm2->partIdent.ident,
                                                UDF_ID_METADATA,
                                                strlen(UDF_ID_METADATA))) {
@@ -2000,6 +2003,8 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
                        if (!silent)
                                pr_notice("Rescanning with blocksize %d\n",
                                          UDF_DEFAULT_BLOCKSIZE);
+                        brelse(sbi->s_lvid_bh);
+                        sbi->s_lvid_bh = NULL;
                        uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
                        ret = udf_load_vrs(sb, &uopt, silent, &fileset);
                }
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 15052ff916ec..e562dd43f41f 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,6 +124,12 @@ xfs_setfilesize_trans_alloc(
        ioend->io_append_trans = tp;
        /*
+         * We will pass freeze protection with a transaction.  So tell lockdep
+         * we released it.
+         */
+        rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                      1, _THIS_IP_);
+        /*
         * We hand off the transaction to the completion thread now, so
         * clear the flag here.
         */
@@ -199,6 +205,15 @@ xfs_end_io(
        struct xfs_inode *ip = XFS_I(ioend->io_inode);
        int             error = 0;
+        if (ioend->io_append_trans) {
+                /*
+                 * We've got freeze protection passed with the transaction.
+                 * Tell lockdep about it.
+                 */
+                rwsem_acquire_read(
+                        &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                        0, 1, _THIS_IP_);
+        }
        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                ioend->io_error = -EIO;
                goto done;
@@ -1425,6 +1440,9 @@ out_trans_cancel:
        if (ioend->io_append_trans) {
                current_set_flags_nested(&ioend->io_append_trans->t_pflags,
                                         PF_FSTRANS);
+                rwsem_acquire_read(
+                        &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                        0, 1, _THIS_IP_);
                xfs_trans_cancel(ioend->io_append_trans, 0);
        }
 out_destroy_ioend:
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index f9c3fe304a17..69cf4fcde03e 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -179,12 +179,14 @@ xfs_ioc_trim(
         * used by the fstrim application.  In the end it really doesn't
         * matter as trimming blocks is an advisory interface.
         */
+        if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
+            range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
+                return -XFS_ERROR(EINVAL);
        start = BTOBB(range.start);
        end = start + BTOBBT(range.len) - 1;
        minlen = BTOBB(max_t(u64, granularity, range.minlen));
-        if (XFS_BB_TO_FSB(mp, start) >= mp->m_sb.sb_dblocks)
-                return -XFS_ERROR(EINVAL);
        if (end > XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1)
                end = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)- 1;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c4559c6e6f2c..56afcdb2377d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -770,10 +770,12 @@ xfs_file_aio_write(
        if (ocount == 0)
                return 0;
-        xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
+        sb_start_write(inode->i_sb);
-        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                return -EIO;
+                ret = -EIO;
+                goto out;
+        }
        if (unlikely(file->f_flags & O_DIRECT))
                ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
@@ -792,6 +794,8 @@ xfs_file_aio_write(
                        ret = err;
        }
+out:
+        sb_end_write(inode->i_sb);
        return ret;
 }
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 21e37b55f7e5..5aceb3f8ecd6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -962,23 +962,22 @@ xfs_dialloc(
                if (!pag->pagi_freecount && !okalloc)
                        goto nextag;
+                /*
+                 * Then read in the AGI buffer and recheck pagi_freecount with
+                 * the AGI buffer lock held.
+                 */
                error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
                if (error)
                        goto out_error;
-                /*
-                 * Once the AGI has been read in we have to recheck
-                 * pagi_freecount with the AGI buffer lock held.
-                 */
                if (pag->pagi_freecount) {
                        xfs_perag_put(pag);
                        goto out_alloc;
                }
-                if (!okalloc) {
+                if (!okalloc)
-                        xfs_trans_brelse(tp, agbp);
+                        goto nextag_relse_buffer;
-                        goto nextag;
-                }
                error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
                if (error) {
@@ -1007,6 +1006,8 @@ xfs_dialloc(
                        return 0;
                }
+nextag_relse_buffer:
+                xfs_trans_brelse(tp, agbp);
 nextag:
                xfs_perag_put(pag);
                if (++agno == mp->m_sb.sb_agcount)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 1f1535d25a9b..0e0232c3b6d9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -364,9 +364,15 @@ xfs_fssetdm_by_handle(
        if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
                return -XFS_ERROR(EFAULT);
+        error = mnt_want_write_file(parfilp);
+        if (error)
+                return error;
        dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
-        if (IS_ERR(dentry))
+        if (IS_ERR(dentry)) {
+                mnt_drop_write_file(parfilp);
                return PTR_ERR(dentry);
+        }
        if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
                error = -XFS_ERROR(EPERM);
@@ -382,6 +388,7 @@ xfs_fssetdm_by_handle(
                                 fsd.fsd_dmstate);
 out:
+        mnt_drop_write_file(parfilp);
        dput(dentry);
        return error;
 }
@@ -634,7 +641,11 @@ xfs_ioc_space(
        if (ioflags & IO_INVIS)
                attr_flags |= XFS_ATTR_DMI;
+        error = mnt_want_write_file(filp);
+        if (error)
+                return error;
        error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+        mnt_drop_write_file(filp);
        return -error;
 }
@@ -1163,6 +1174,7 @@ xfs_ioc_fssetxattr(
 {
        struct fsxattr          fa;
        unsigned int            mask;
+        int error;
        if (copy_from_user(&fa, arg, sizeof(fa)))
                return -EFAULT;
@@ -1171,7 +1183,12 @@ xfs_ioc_fssetxattr(
        if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
                mask |= FSX_NONBLOCK;
-        return -xfs_ioctl_setattr(ip, &fa, mask);
+        error = mnt_want_write_file(filp);
+        if (error)
+                return error;
+        error = xfs_ioctl_setattr(ip, &fa, mask);
+        mnt_drop_write_file(filp);
+        return -error;
 }
 STATIC int
@@ -1196,6 +1213,7 @@ xfs_ioc_setxflags(
        struct fsxattr          fa;
        unsigned int            flags;
        unsigned int            mask;
+        int error;
        if (copy_from_user(&flags, arg, sizeof(flags)))
                return -EFAULT;
@@ -1210,7 +1228,12 @@ xfs_ioc_setxflags(
                mask |= FSX_NONBLOCK;
        fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
-        return -xfs_ioctl_setattr(ip, &fa, mask);
+        error = mnt_want_write_file(filp);
+        if (error)
+                return error;
+        error = xfs_ioctl_setattr(ip, &fa, mask);
+        mnt_drop_write_file(filp);
+        return -error;
 }
 STATIC int
@@ -1385,8 +1408,13 @@ xfs_file_ioctl(
                if (copy_from_user(&dmi, arg, sizeof(dmi)))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
                                dmi.fsd_dmstate);
+                mnt_drop_write_file(filp);
                return -error;
        }
@@ -1434,7 +1462,11 @@ xfs_file_ioctl(
                if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_swapext(&sxp);
+                mnt_drop_write_file(filp);
                return -error;
        }
@@ -1463,9 +1495,14 @@ xfs_file_ioctl(
                if (copy_from_user(&inout, arg, sizeof(inout)))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                /* input parameter is passed in resblks field of structure */
                in = inout.resblks;
                error = xfs_reserve_blocks(mp, &in, &inout);
+                mnt_drop_write_file(filp);
                if (error)
                        return -error;
@@ -1496,7 +1533,11 @@ xfs_file_ioctl(
                if (copy_from_user(&in, arg, sizeof(in)))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_growfs_data(mp, &in);
+                mnt_drop_write_file(filp);
                return -error;
        }
@@ -1506,7 +1547,11 @@ xfs_file_ioctl(
                if (copy_from_user(&in, arg, sizeof(in)))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_growfs_log(mp, &in);
+                mnt_drop_write_file(filp);
                return -error;
        }
@@ -1516,7 +1561,11 @@ xfs_file_ioctl(
                if (copy_from_user(&in, arg, sizeof(in)))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_growfs_rt(mp, &in);
+                mnt_drop_write_file(filp);
                return -error;
        }
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index c4f2da0d2bf5..1244274a5674 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -600,7 +600,11 @@ xfs_file_compat_ioctl(
                if (xfs_compat_growfs_data_copyin(&in, arg))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_growfs_data(mp, &in);
+                mnt_drop_write_file(filp);
                return -error;
        }
        case XFS_IOC_FSGROWFSRT_32: {
@@ -608,7 +612,11 @@ xfs_file_compat_ioctl(
                if (xfs_compat_growfs_rt_copyin(&in, arg))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_growfs_rt(mp, &in);
+                mnt_drop_write_file(filp);
                return -error;
        }
 #endif
@@ -627,7 +635,11 @@ xfs_file_compat_ioctl(
                                   offsetof(struct xfs_swapext, sx_stat)) ||
                    xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
                        return -XFS_ERROR(EFAULT);
+                error = mnt_want_write_file(filp);
+                if (error)
+                        return error;
                error = xfs_swapext(&sxp);
+                mnt_drop_write_file(filp);
                return -error;
        }
        case XFS_IOC_FSBULKSTAT_32:
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 915edf6639f0..973dff6ad935 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -680,9 +680,9 @@ xfs_iomap_write_unwritten(
                 * the same inode that we complete here and might deadlock
                 * on the iolock.
                 */
-                xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+                sb_start_intwrite(mp->m_super);
                tp = _xfs_trans_alloc(mp, XFS_TRANS_STRAT_WRITE, KM_NOFS);
-                tp->t_flags |= XFS_TRANS_RESERVE;
+                tp->t_flags |= XFS_TRANS_RESERVE | XFS_TRANS_FREEZE_PROT;
                error = xfs_trans_reserve(tp, resblks,
                                XFS_WRITE_LOG_RES(mp), 0,
                                XFS_TRANS_PERM_LOG_RES,
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 711ca51ca3d7..29c2f83d4147 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1551,7 +1551,7 @@ xfs_unmountfs(
 int
 xfs_fs_writable(xfs_mount_t *mp)
 {
-        return !(xfs_test_for_freeze(mp) || XFS_FORCED_SHUTDOWN(mp) ||
+        return !(mp->m_super->s_writers.frozen || XFS_FORCED_SHUTDOWN(mp) ||
                (mp->m_flags & XFS_MOUNT_RDONLY));
 }
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 8724336a9a08..05a05a7b6119 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -311,9 +311,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 #define SHUTDOWN_REMOTE_REQ     0x0010  /* shutdown came from remote cell */
 #define SHUTDOWN_DEVICE_REQ     0x0020  /* failed all paths to the device */
-#define xfs_test_for_freeze(mp)         ((mp)->m_super->s_frozen)
-#define xfs_wait_for_freeze(mp,l)       vfs_check_frozen((mp)->m_super, (l))
 /*
 * Flags for xfs_mountfs
 */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 92d4331cd4f1..ca28a4ba4b54 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -857,7 +857,7 @@ xfs_rtbuf_get(
        xfs_buf_t       *bp;            /* block buffer, result */
        xfs_inode_t     *ip;            /* bitmap or summary inode */
        xfs_bmbt_irec_t map;
-        int             nmap;
+        int             nmap = 1;
        int             error;          /* error value */
        ip = issum ? mp->m_rsumip : mp->m_rbmip;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 97304f10e78a..96548176db80 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -403,7 +403,7 @@ xfs_sync_worker(
        if (!(mp->m_super->s_flags & MS_ACTIVE) &&
            !(mp->m_flags & XFS_MOUNT_RDONLY)) {
                /* dgc: errors ignored here */
-                if (mp->m_super->s_frozen == SB_UNFROZEN &&
+                if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
                    xfs_log_need_covered(mp))
                        error = xfs_fs_log_dummy(mp);
                else
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdf324508c5e..06ed520a767f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -576,8 +576,12 @@ xfs_trans_alloc(
        xfs_mount_t     *mp,
        uint            type)
 {
-        xfs_wait_for_freeze(mp, SB_FREEZE_TRANS);
+        xfs_trans_t     *tp;
-        return _xfs_trans_alloc(mp, type, KM_SLEEP);
+        sb_start_intwrite(mp->m_super);
+        tp = _xfs_trans_alloc(mp, type, KM_SLEEP);
+        tp->t_flags |= XFS_TRANS_FREEZE_PROT;
+        return tp;
 }
 xfs_trans_t *
@@ -588,6 +592,7 @@ _xfs_trans_alloc(
 {
        xfs_trans_t     *tp;
+        WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
        atomic_inc(&mp->m_active_trans);
        tp = kmem_zone_zalloc(xfs_trans_zone, memflags);
@@ -611,6 +616,8 @@ xfs_trans_free(
        xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
        atomic_dec(&tp->t_mountp->m_active_trans);
+        if (tp->t_flags & XFS_TRANS_FREEZE_PROT)
+                sb_end_intwrite(tp->t_mountp->m_super);
        xfs_trans_free_dqinfo(tp);
        kmem_zone_free(xfs_trans_zone, tp);
 }
@@ -643,7 +650,11 @@ xfs_trans_dup(
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
        ASSERT(tp->t_ticket != NULL);
-        ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
+        ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
+                       (tp->t_flags & XFS_TRANS_RESERVE) |
+                       (tp->t_flags & XFS_TRANS_FREEZE_PROT);
+        /* We gave our writer reference to the new transaction */
+        tp->t_flags &= ~XFS_TRANS_FREEZE_PROT;
        ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
        ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
        tp->t_blk_res = tp->t_blk_res_used;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index bc2afd52a0b7..db056544cbb5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -179,6 +179,8 @@ struct xfs_log_item_desc {
 #define XFS_TRANS_SYNC          0x08    /* make commit synchronous */
 #define XFS_TRANS_DQ_DIRTY      0x10    /* at least one dquot in trx dirty */
 #define XFS_TRANS_RESERVE       0x20    /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT   0x40    /* Transaction has elevated writer
+                                           count in superblock */
 /*
 * Values for call flags parameter.