33 files changed, 519 insertions, 206 deletions
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 521e9d4424f6..5a3327b8f90d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3997,7 +3997,7 @@ again:
         * We make the other tasks wait for the flush only when we can flush
         * all things.
         */
-        if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
+        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                flushing = true;
                space_info->flush = 1;
        }
@@ -4534,7 +4534,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        unsigned nr_extents = 0;
        int extra_reserve = 0;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
-        int ret;
+        int ret = 0;
        bool delalloc_lock = true;
        /* If we are a free space inode we need to not flush since we will be in
@@ -4579,20 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        csum_bytes = BTRFS_I(inode)->csum_bytes;
        spin_unlock(&BTRFS_I(inode)->lock);
-        if (root->fs_info->quota_enabled) {
+        if (root->fs_info->quota_enabled)
                ret = btrfs_qgroup_reserve(root, num_bytes +
                                           nr_extents * root->leafsize);
-                if (ret) {
-                        spin_lock(&BTRFS_I(inode)->lock);
-                        calc_csum_metadata_size(inode, num_bytes, 0);
-                        spin_unlock(&BTRFS_I(inode)->lock);
-                        if (delalloc_lock)
-                                mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
-                        return ret;
-                }
-        }
-        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+        /*
+         * ret != 0 here means the qgroup reservation failed, we go straight to
+         * the shared error handling then.
+         */
+        if (ret == 0)
+                ret = reserve_metadata_bytes(root, block_rsv,
+                                             to_reserve, flush);
        if (ret) {
                u64 to_free = 0;
                unsigned dropped;
@@ -5560,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        int empty_cluster = 2 * 1024 * 1024;
        struct btrfs_space_info *space_info;
        int loop = 0;
-        int index = 0;
+        int index = __get_raid_index(data);
        int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
                RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
        bool found_uncached_bg = false;
@@ -6788,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                                       &wc->flags[level]);
                        if (ret < 0) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
+                                path->locks[level] = 0;
                                return ret;
                        }
                        BUG_ON(wc->refs[level] == 0);
                        if (wc->refs[level] == 1) {
                                btrfs_tree_unlock_rw(eb, path->locks[level]);
+                                path->locks[level] = 0;
                                return 1;
                        }
                }
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f169d6b11d7f..fdb7a8db3b57 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
        if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
                return 0;
+        if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+            test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+                return 0;
        if (extent_map_end(prev) == next->start &&
            prev->flags == next->flags &&
            prev->bdev == next->bdev &&
@@ -255,7 +259,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
        if (!em)
                goto out;
-        list_move(&em->list, &tree->modified_extents);
+        if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+                list_move(&em->list, &tree->modified_extents);
        em->generation = gen;
        clear_bit(EXTENT_FLAG_PINNED, &em->flags);
        em->mod_start = em->start;
@@ -280,6 +285,13 @@ out:
 }
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+        clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+        if (em->in_tree)
+                try_merge_map(tree, em);
+}
 /**
 * add_extent_mapping - add new extent map to the extent tree
 * @tree:       tree to insert new map in
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 922943ce29e8..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
 int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
                                         u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bd38cef42358..94aa53b38721 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
                if (!contig)
                        offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-                if (!contig && (offset >= ordered->file_offset + ordered->len ||
+                if (offset >= ordered->file_offset + ordered->len ||
-                    offset < ordered->file_offset)) {
+                    offset < ordered->file_offset) {
                        unsigned long bytes_left;
                        sums->len = this_sum_bytes;
                        this_sum_bytes = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 77061bf43edb..aeb84469d2c4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
        struct btrfs_key key;
        struct btrfs_ioctl_defrag_range_args range;
        int num_defrag;
+        int index;
+        int ret;
        /* get the inode */
        key.objectid = defrag->root;
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
        key.offset = (u64)-1;
+        index = srcu_read_lock(&fs_info->subvol_srcu);
        inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
        if (IS_ERR(inode_root)) {
-                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+                ret = PTR_ERR(inode_root);
-                return PTR_ERR(inode_root);
+                goto cleanup;
+        }
+        if (btrfs_root_refs(&inode_root->root_item) == 0) {
+                ret = -ENOENT;
+                goto cleanup;
        }
        key.objectid = defrag->ino;
@@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
        if (IS_ERR(inode)) {
-                kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+                ret = PTR_ERR(inode);
-                return PTR_ERR(inode);
+                goto cleanup;
        }
+        srcu_read_unlock(&fs_info->subvol_srcu, index);
        /* do a chunk of defrag */
        clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
        iput(inode);
        return 0;
+cleanup:
+        srcu_read_unlock(&fs_info->subvol_srcu, index);
+        kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+        return ret;
 }
 /*
@@ -1594,9 +1608,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                if (err < 0 && num_written > 0)
                        num_written = err;
        }
-out:
        if (sync)
                atomic_dec(&BTRFS_I(inode)->sync_writers);
+out:
        sb_end_write(inode->i_sb);
        current->backing_dev_info = NULL;
        return num_written ? num_written : err;
@@ -2241,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
        if (lockend <= lockstart)
                lockend = lockstart + root->sectorsize;
+        lockend--;
        len = lockend - lockstart + 1;
        len = max_t(u64, len, root->sectorsize);
@@ -2307,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
                                        }
                                }
-                                *offset = start;
+                                if (!test_bit(EXTENT_FLAG_PREALLOC,
-                                free_extent_map(em);
+                                              &em->flags)) {
-                                break;
+                                        *offset = start;
+                                        free_extent_map(em);
+                                        break;
+                                }
                        }
                }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..0be7a8742a43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 {
        struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
        struct btrfs_free_space *info;
-        int ret = 0;
+        int ret;
+        bool re_search = false;
        spin_lock(&ctl->tree_lock);
 again:
+        ret = 0;
        if (!bytes)
                goto out_lock;
@@ -1879,17 +1881,17 @@ again:
                info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
                                          1, 0);
                if (!info) {
-                        /* the tree logging code might be calling us before we
+                        /*
-                         * have fully loaded the free space rbtree for this
+                         * If we found a partial bit of our free space in a
-                         * block group.  So it is possible the entry won't
+                         * bitmap but then couldn't find the other part this may
-                         * be in the rbtree yet at all.  The caching code
+                         * be a problem, so WARN about it.
-                         * will make sure not to put it in the rbtree if
-                         * the logging code has pinned it.
                         */
+                        WARN_ON(re_search);
                        goto out_lock;
                }
        }
+        re_search = false;
        if (!info->bitmap) {
                unlink_free_space(ctl, info);
                if (offset == info->offset) {
@@ -1935,8 +1937,10 @@ again:
        }
        ret = remove_from_bitmap(ctl, info, &offset, &bytes);
-        if (ret == -EAGAIN)
+        if (ret == -EAGAIN) {
+                re_search = true;
                goto again;
+        }
        BUG_ON(ret); /* logic error */
 out_lock:
        spin_unlock(&ctl->tree_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16d9e8e191e6..cc93b23ca352 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
        [S_IFLNK >> S_SHIFT]    = BTRFS_FT_SYMLINK,
 };
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 static int btrfs_truncate(struct inode *inode);
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
@@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                continue;
                        }
                        nr_truncate++;
+                        /* 1 for the orphan item deletion. */
+                        trans = btrfs_start_transaction(root, 1);
+                        if (IS_ERR(trans)) {
+                                ret = PTR_ERR(trans);
+                                goto out;
+                        }
+                        ret = btrfs_orphan_add(trans, inode);
+                        btrfs_end_transaction(trans, root);
+                        if (ret)
+                                goto out;
                        ret = btrfs_truncate(inode);
                } else {
                        nr_unlink++;
@@ -3665,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
                                block_end - cur_offset, 0);
                if (IS_ERR(em)) {
                        err = PTR_ERR(em);
+                        em = NULL;
                        break;
                }
                last_byte = min(extent_map_end(em), block_end);
@@ -3748,16 +3761,27 @@ next:
        return err;
 }
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
        loff_t oldsize = i_size_read(inode);
+        loff_t newsize = attr->ia_size;
+        int mask = attr->ia_valid;
        int ret;
        if (newsize == oldsize)
                return 0;
+        /*
+         * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+         * special case where we need to update the times despite not having
+         * these flags set.  For all other operations the VFS sets these flags
+         * explicitly if it wants a timestamp update.
+         */
+        if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+                inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
        if (newsize > oldsize) {
                truncate_pagecache(inode, oldsize, newsize);
                ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3783,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
                        set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
                                &BTRFS_I(inode)->runtime_flags);
+                /*
+                 * 1 for the orphan item we're going to add
+                 * 1 for the orphan item deletion.
+                 */
+                trans = btrfs_start_transaction(root, 2);
+                if (IS_ERR(trans))
+                        return PTR_ERR(trans);
+                /*
+                 * We need to do this in case we fail at _any_ point during the
+                 * actual truncate.  Once we do the truncate_setsize we could
+                 * invalidate pages which forces any outstanding ordered io to
+                 * be instantly completed which will give us extents that need
+                 * to be truncated.  If we fail to get an orphan inode down we
+                 * could have left over extents that were never meant to live,
+                 * so we need to guarantee from this point on that everything
+                 * will be consistent.
+                 */
+                ret = btrfs_orphan_add(trans, inode);
+                btrfs_end_transaction(trans, root);
+                if (ret)
+                        return ret;
                /* we don't support swapfiles, so vmtruncate shouldn't fail */
                truncate_setsize(inode, newsize);
                ret = btrfs_truncate(inode);
+                if (ret && inode->i_nlink)
+                        btrfs_orphan_del(NULL, inode);
        }
        return ret;
@@ -3805,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                return err;
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-                err = btrfs_setsize(inode, attr->ia_size);
+                err = btrfs_setsize(inode, attr);
                if (err)
                        return err;
        }
@@ -5572,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
                return em;
        if (em) {
                /*
-                 * if our em maps to a hole, there might
+                 * if our em maps to
-                 * actually be delalloc bytes behind it
+                 * -  a hole or
+                 * -  a pre-alloc extent,
+                 * there might actually be delalloc bytes behind it.
                 */
-                if (em->block_start != EXTENT_MAP_HOLE)
+                if (em->block_start != EXTENT_MAP_HOLE &&
+                    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        return em;
                else
                        hole_em = em;
@@ -5657,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
                         */
                        em->block_start = hole_em->block_start;
                        em->block_len = hole_len;
+                        if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+                                set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
                } else {
                        em->start = range_start;
                        em->len = found;
@@ -6915,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode)
        /*
         * 1 for the truncate slack space
-         * 1 for the orphan item we're going to add
-         * 1 for the orphan item deletion
         * 1 for updating the inode.
         */
-        trans = btrfs_start_transaction(root, 4);
+        trans = btrfs_start_transaction(root, 2);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto out;
@@ -6930,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
                                      min_size);
        BUG_ON(ret);
-        ret = btrfs_orphan_add(trans, inode);
-        if (ret) {
-                btrfs_end_transaction(trans, root);
-                goto out;
-        }
        /*
         * setattr is responsible for setting the ordered_data_close flag,
         * but that is only tested during the last file release.  That
@@ -7004,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode)
                ret = btrfs_orphan_del(trans, inode);
                if (ret)
                        err = ret;
-        } else if (ret && inode->i_nlink > 0) {
-                /*
-                 * Failed to do the truncate, remove us from the in memory
-                 * orphan list.
-                 */
-                ret = btrfs_orphan_del(NULL, inode);
        }
        if (trans) {
@@ -7531,41 +7571,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
 */
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
-        struct list_head *head = &root->fs_info->delalloc_inodes;
        struct btrfs_inode *binode;
        struct inode *inode;
        struct btrfs_delalloc_work *work, *next;
        struct list_head works;
+        struct list_head splice;
        int ret = 0;
        if (root->fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
        INIT_LIST_HEAD(&works);
+        INIT_LIST_HEAD(&splice);
+again:
        spin_lock(&root->fs_info->delalloc_lock);
-        while (!list_empty(head)) {
+        list_splice_init(&root->fs_info->delalloc_inodes, &splice);
-                binode = list_entry(head->next, struct btrfs_inode,
+        while (!list_empty(&splice)) {
+                binode = list_entry(splice.next, struct btrfs_inode,
                                    delalloc_inodes);
+                list_del_init(&binode->delalloc_inodes);
                inode = igrab(&binode->vfs_inode);
                if (!inode)
-                        list_del_init(&binode->delalloc_inodes);
+                        continue;
+                list_add_tail(&binode->delalloc_inodes,
+                              &root->fs_info->delalloc_inodes);
                spin_unlock(&root->fs_info->delalloc_lock);
-                if (inode) {
-                        work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+                work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-                        if (!work) {
+                if (unlikely(!work)) {
-                                ret = -ENOMEM;
+                        ret = -ENOMEM;
-                                goto out;
+                        goto out;
-                        }
-                        list_add_tail(&work->list, &works);
-                        btrfs_queue_worker(&root->fs_info->flush_workers,
-                                           &work->work);
                }
+                list_add_tail(&work->list, &works);
+                btrfs_queue_worker(&root->fs_info->flush_workers,
+                                   &work->work);
                cond_resched();
                spin_lock(&root->fs_info->delalloc_lock);
        }
        spin_unlock(&root->fs_info->delalloc_lock);
+        list_for_each_entry_safe(work, next, &works, list) {
+                list_del_init(&work->list);
+                btrfs_wait_and_free_delalloc_work(work);
+        }
+        spin_lock(&root->fs_info->delalloc_lock);
+        if (!list_empty(&root->fs_info->delalloc_inodes)) {
+                spin_unlock(&root->fs_info->delalloc_lock);
+                goto again;
+        }
+        spin_unlock(&root->fs_info->delalloc_lock);
        /* the filemap_flush will queue IO into the worker threads, but
         * we have to make sure the IO is actually started and that
         * ordered extents get created before we return
@@ -7578,11 +7638,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
                    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
        }
        atomic_dec(&root->fs_info->async_submit_draining);
+        return 0;
 out:
        list_for_each_entry_safe(work, next, &works, list) {
                list_del_init(&work->list);
                btrfs_wait_and_free_delalloc_work(work);
        }
+        if (!list_empty_careful(&splice)) {
+                spin_lock(&root->fs_info->delalloc_lock);
+                list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+                spin_unlock(&root->fs_info->delalloc_lock);
+        }
        return ret;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4b4516770f05..338f2597bf7f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -515,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root,
        BUG_ON(ret);
-        d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
        if (async_transid) {
                *async_transid = trans->transid;
@@ -525,6 +524,10 @@ fail:
        }
        if (err && !ret)
                ret = err;
+        if (!ret)
+                d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
        return ret;
 }
@@ -1339,7 +1342,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
        if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
                        1)) {
                pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-                return -EINPROGRESS;
+                mnt_drop_write_file(file);
+                return -EINVAL;
        }
        mutex_lock(&root->fs_info->volume_mutex);
@@ -1362,6 +1366,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                printk(KERN_INFO "btrfs: resizing devid %llu\n",
                       (unsigned long long)devid);
        }
        device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
        if (!device) {
                printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
@@ -1369,9 +1374,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
                ret = -EINVAL;
                goto out_free;
        }
-        if (device->fs_devices && device->fs_devices->seeding) {
+        if (!device->writeable) {
                printk(KERN_INFO "btrfs: resizer unable to apply on "
-                       "seeding device %llu\n",
+                       "readonly device %llu\n",
                       (unsigned long long)devid);
                ret = -EINVAL;
                goto out_free;
@@ -1443,8 +1449,8 @@ out_free:
        kfree(vol_args);
 out:
        mutex_unlock(&root->fs_info->volume_mutex);
-        mnt_drop_write_file(file);
        atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+        mnt_drop_write_file(file);
        return ret;
 }
@@ -2095,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
                err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
                if (err)
                        goto out_dput;
-                /* check if subvolume may be deleted by a non-root user */
-                err = btrfs_may_delete(dir, dentry, 1);
-                if (err)
-                        goto out_dput;
        }
+        /* check if subvolume may be deleted by a user */
+        err = btrfs_may_delete(dir, dentry, 1);
+        if (err)
+                goto out_dput;
        if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
                err = -EINVAL;
                goto out_dput;
@@ -2183,19 +2189,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
        struct btrfs_ioctl_defrag_range_args *range;
        int ret;
-        if (btrfs_root_readonly(root))
+        ret = mnt_want_write_file(file);
-                return -EROFS;
+        if (ret)
+                return ret;
        if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
                        1)) {
                pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-                return -EINPROGRESS;
+                mnt_drop_write_file(file);
+                return -EINVAL;
        }
-        ret = mnt_want_write_file(file);
-        if (ret) {
+        if (btrfs_root_readonly(root)) {
-                atomic_set(&root->fs_info->mutually_exclusive_operation_running,
+                ret = -EROFS;
-                           0);
+                goto out;
-                return ret;
        }
        switch (inode->i_mode & S_IFMT) {
@@ -2247,8 +2254,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                ret = -EINVAL;
        }
 out:
-        mnt_drop_write_file(file);
        atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+        mnt_drop_write_file(file);
        return ret;
 }
@@ -2263,7 +2270,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
        if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
                        1)) {
                pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-                return -EINPROGRESS;
+                return -EINVAL;
        }
        mutex_lock(&root->fs_info->volume_mutex);
@@ -2300,7 +2307,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
                        1)) {
                pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
                mnt_drop_write_file(file);
-                return -EINPROGRESS;
+                return -EINVAL;
        }
        mutex_lock(&root->fs_info->volume_mutex);
@@ -2316,8 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
        kfree(vol_args);
 out:
        mutex_unlock(&root->fs_info->volume_mutex);
-        mnt_drop_write_file(file);
        atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+        mnt_drop_write_file(file);
        return ret;
 }
@@ -3437,8 +3444,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_ioctl_balance_args *bargs;
        struct btrfs_balance_control *bctl;
+        bool need_unlock; /* for mut. excl. ops lock */
        int ret;
-        int need_to_clear_lock = 0;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -3447,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
        if (ret)
                return ret;
-        mutex_lock(&fs_info->volume_mutex);
+again:
+        if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+                mutex_lock(&fs_info->volume_mutex);
+                mutex_lock(&fs_info->balance_mutex);
+                need_unlock = true;
+                goto locked;
+        }
+        /*
+         * mut. excl. ops lock is locked.  Three possibilities:
+         *   (1) some other op is running
+         *   (2) balance is running
+         *   (3) balance is paused -- special case (think resume)
+         */
        mutex_lock(&fs_info->balance_mutex);
+        if (fs_info->balance_ctl) {
+                /* this is either (2) or (3) */
+                if (!atomic_read(&fs_info->balance_running)) {
+                        mutex_unlock(&fs_info->balance_mutex);
+                        if (!mutex_trylock(&fs_info->volume_mutex))
+                                goto again;
+                        mutex_lock(&fs_info->balance_mutex);
+                        if (fs_info->balance_ctl &&
+                            !atomic_read(&fs_info->balance_running)) {
+                                /* this is (3) */
+                                need_unlock = false;
+                                goto locked;
+                        }
+                        mutex_unlock(&fs_info->balance_mutex);
+                        mutex_unlock(&fs_info->volume_mutex);
+                        goto again;
+                } else {
+                        /* this is (2) */
+                        mutex_unlock(&fs_info->balance_mutex);
+                        ret = -EINPROGRESS;
+                        goto out;
+                }
+        } else {
+                /* this is (1) */
+                mutex_unlock(&fs_info->balance_mutex);
+                pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
+                ret = -EINVAL;
+                goto out;
+        }
+locked:
+        BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
        if (arg) {
                bargs = memdup_user(arg, sizeof(*bargs));
                if (IS_ERR(bargs)) {
                        ret = PTR_ERR(bargs);
-                        goto out;
+                        goto out_unlock;
                }
                if (bargs->flags & BTRFS_BALANCE_RESUME) {
@@ -3474,13 +3528,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
                bargs = NULL;
        }
-        if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+        if (fs_info->balance_ctl) {
-                        1)) {
-                pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
                ret = -EINPROGRESS;
                goto out_bargs;
        }
-        need_to_clear_lock = 1;
        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
        if (!bctl) {
@@ -3501,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
        }
 do_balance:
-        ret = btrfs_balance(bctl, bargs);
        /*
-         * bctl is freed in __cancel_balance or in free_fs_info if
+         * Ownership of bctl and mutually_exclusive_operation_running
-         * restriper was paused all the way until unmount
+         * goes to btrfs_balance.  bctl is freed in __cancel_balance,
+         * or, if restriper was paused all the way until unmount, in
+         * free_fs_info.  mutually_exclusive_operation_running is
+         * cleared in __cancel_balance.
         */
+        need_unlock = false;
+        ret = btrfs_balance(bctl, bargs);
        if (arg) {
                if (copy_to_user(arg, bargs, sizeof(*bargs)))
                        ret = -EFAULT;
@@ -3513,12 +3570,12 @@ do_balance:
 out_bargs:
        kfree(bargs);
-out:
+out_unlock:
-        if (need_to_clear_lock)
-                atomic_set(&root->fs_info->mutually_exclusive_operation_running,
-                           0);
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
+        if (need_unlock)
+                atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
        mnt_drop_write_file(file);
        return ret;
 }
@@ -3698,6 +3755,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
                goto drop_write;
        }
+        if (!sa->qgroupid) {
+                ret = -EINVAL;
+                goto out;
+        }
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index f10731297040..e5ed56729607 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
         * if the disk i_size is already at the inode->i_size, or
         * this ordered extent is inside the disk i_size, we're done
         */
-        if (disk_i_size == i_size || offset <= disk_i_size) {
+        if (disk_i_size == i_size)
+                goto out;
+        /*
+         * We still need to update disk_i_size if outstanding_isize is greater
+         * than disk_i_size.
+         */
+        if (offset <= disk_i_size &&
+            (!ordered || ordered->outstanding_isize <= disk_i_size))
                goto out;
-        }
        /*
         * walk backward from this ordered extent to disk_i_size.
@@ -870,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                        break;
                if (test->file_offset >= i_size)
                        break;
-                if (test->file_offset >= disk_i_size) {
+                if (entry_end(test) > disk_i_size) {
                        /*
                         * we don't update disk_i_size now, so record this
                         * undealt i_size. Or we will not know the real
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe9d02c45f8e..a5c856234323 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -379,6 +379,13 @@ next1:
                ret = add_relation_rb(fs_info, found_key.objectid,
                                      found_key.offset);
+                if (ret == -ENOENT) {
+                        printk(KERN_WARNING
+                                "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
+                                (unsigned long long)found_key.objectid,
+                                (unsigned long long)found_key.offset);
+                        ret = 0;        /* ignore the error */
+                }
                if (ret)
                        goto out;
 next2:
@@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
                        struct btrfs_fs_info *fs_info, u64 qgroupid)
 {
        struct btrfs_root *quota_root;
+        struct btrfs_qgroup *qgroup;
        int ret = 0;
        quota_root = fs_info->quota_root;
        if (!quota_root)
                return -EINVAL;
+        /* check if there are no relations to this qgroup */
+        spin_lock(&fs_info->qgroup_lock);
+        qgroup = find_qgroup_rb(fs_info, qgroupid);
+        if (qgroup) {
+                if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) {
+                        spin_unlock(&fs_info->qgroup_lock);
+                        return -EBUSY;
+                }
+        }
+        spin_unlock(&fs_info->qgroup_lock);
        ret = del_qgroup_item(trans, quota_root, qgroupid);
        spin_lock(&fs_info->qgroup_lock);
        del_qgroup_rb(quota_root->fs_info, qgroupid);
        spin_unlock(&fs_info->qgroup_lock);
        return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index bdbb94f245c9..67783e03d121 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
        int corrected = 0;
        struct btrfs_key key;
        struct inode *inode = NULL;
+        struct btrfs_fs_info *fs_info;
        u64 end = offset + PAGE_SIZE - 1;
        struct btrfs_root *local_root;
+        int srcu_index;
        key.objectid = root;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;
-        local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
-        if (IS_ERR(local_root))
+        fs_info = fixup->root->fs_info;
+        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+        if (IS_ERR(local_root)) {
+                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
                return PTR_ERR(local_root);
+        }
        key.type = BTRFS_INODE_ITEM_KEY;
        key.objectid = inum;
        key.offset = 0;
-        inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);
@@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
        }
        if (PageUptodate(page)) {
-                struct btrfs_fs_info *fs_info;
                if (PageDirty(page)) {
                        /*
                         * we need to write the data to the defect sector. the
@@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
        u64 physical_for_dev_replace;
        u64 len;
        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+        int srcu_index;
        key.objectid = root;
        key.type = BTRFS_ROOT_ITEM_KEY;
        key.offset = (u64)-1;
+        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
-        if (IS_ERR(local_root))
+        if (IS_ERR(local_root)) {
+                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
                return PTR_ERR(local_root);
+        }
        key.type = BTRFS_INODE_ITEM_KEY;
        key.objectid = inum;
        key.offset = 0;
        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
        if (IS_ERR(inode))
                return PTR_ERR(inode);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 54454542ad40..321b7fb4e441 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx,
                        (unsigned long)nce->ino);
        if (!nce_head) {
                nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS);
-                if (!nce_head)
+                if (!nce_head) {
+                        kfree(nce);
                        return -ENOMEM;
+                }
                INIT_LIST_HEAD(nce_head);
                ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 99545df1b86c..d8982e9601d3 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
                             function, line, errstr);
                return;
        }
-        trans->transaction->aborted = errno;
+        ACCESS_ONCE(trans->transaction->aborted) = errno;
        __btrfs_std_error(root->fs_info, function, line, errno, NULL);
 }
 /*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 87fac9a21ea5..fc03aa60b684 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -333,12 +333,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type,
                                          &root->fs_info->trans_block_rsv,
                                          num_bytes, flush);
                if (ret)
-                        return ERR_PTR(ret);
+                        goto reserve_fail;
        }
 again:
        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
-        if (!h)
+        if (!h) {
-                return ERR_PTR(-ENOMEM);
+                ret = -ENOMEM;
+                goto alloc_fail;
+        }
        /*
         * If we are JOIN_NOLOCK we're already committing a transaction and
@@ -365,11 +367,7 @@ again:
        if (ret < 0) {
                /* We must get the transaction if we are JOIN_NOLOCK. */
                BUG_ON(type == TRANS_JOIN_NOLOCK);
+                goto join_fail;
-                if (type < TRANS_JOIN_NOLOCK)
-                        sb_end_intwrite(root->fs_info->sb);
-                kmem_cache_free(btrfs_trans_handle_cachep, h);
-                return ERR_PTR(ret);
        }
        cur_trans = root->fs_info->running_transaction;
@@ -410,6 +408,19 @@ got_it:
        if (!current->journal_info && type != TRANS_USERSPACE)
                current->journal_info = h;
        return h;
+join_fail:
+        if (type < TRANS_JOIN_NOLOCK)
+                sb_end_intwrite(root->fs_info->sb);
+        kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+        if (num_bytes)
+                btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+                                        num_bytes);
+reserve_fail:
+        if (qgroup_reserved)
+                btrfs_qgroup_free(root, qgroup_reserved);
+        return ERR_PTR(ret);
 }
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
@@ -1468,7 +1479,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                goto cleanup_transaction;
        }
-        if (cur_trans->aborted) {
+        /* Stop the commit early if ->aborted is set */
+        if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
                ret = cur_trans->aborted;
                goto cleanup_transaction;
        }
@@ -1574,6 +1586,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        wait_event(cur_trans->writer_wait,
                   atomic_read(&cur_trans->num_writers) == 1);
+        /* ->aborted might be set after the previous check, so check it */
+        if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+                ret = cur_trans->aborted;
+                goto cleanup_transaction;
+        }
        /*
         * the reloc mutex makes sure that we stop
         * the balancing code from coming in and moving
@@ -1657,6 +1674,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                goto cleanup_transaction;
        }
+        /*
+         * The tasks which save the space cache and inode cache may also
+         * update ->aborted, check it.
+         */
+        if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+                ret = cur_trans->aborted;
+                mutex_unlock(&root->fs_info->tree_log_mutex);
+                mutex_unlock(&root->fs_info->reloc_mutex);
+                goto cleanup_transaction;
+        }
        btrfs_prepare_extent_commit(trans, root);
        cur_trans = root->fs_info->running_transaction;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 83186c7e45d4..9027bb1e7466 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
        if (skip_csum)
                return 0;
+        if (em->compress_type) {
+                csum_offset = 0;
+                csum_len = block_len;
+        }
        /* block start is already adjusted for the file extent offset. */
        ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
                                       em->block_start + csum_offset,
@@ -3410,13 +3415,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                em = list_entry(extents.next, struct extent_map, list);
                list_del_init(&em->list);
-                clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
                /*
                 * If we had an error we just need to delete everybody from our
                 * private list.
                 */
                if (ret) {
+                        clear_em_logging(tree, em);
                        free_extent_map(em);
                        continue;
                }
@@ -3424,8 +3429,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                write_unlock(&tree->lock);
                ret = log_one_extent(trans, inode, root, em, path);
-                free_extent_map(em);
                write_lock(&tree->lock);
+                clear_em_logging(tree, em);
+                free_extent_map(em);
        }
        WARN_ON(!list_empty(&extents));
        write_unlock(&tree->lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5cce6aa74012..5cbb7f4b1672 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
                }
        } else {
                ret = btrfs_get_bdev_and_sb(device_path,
-                                            FMODE_READ | FMODE_EXCL,
+                                            FMODE_WRITE | FMODE_EXCL,
                                            root->fs_info->bdev_holder, 0,
                                            &bdev, &bh);
                if (ret)
@@ -1556,7 +1556,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
        ret = 0;
        /* Notify udev that device has changed */
-        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+        if (bdev)
+                btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
 error_brelse:
        brelse(bh);
@@ -2614,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
        chunk_used = btrfs_block_group_used(&cache->item);
-        user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
+        if (bargs->usage == 0)
+                user_thresh = 0;
+        else if (bargs->usage > 100)
+                user_thresh = cache->key.offset;
+        else
+                user_thresh = div_factor_fine(cache->key.offset,
+                                              bargs->usage);
        if (chunk_used < user_thresh)
                ret = 0;
@@ -2959,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
        unset_balance_control(fs_info);
        ret = del_balance_item(fs_info->tree_root);
        BUG_ON(ret);
+        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
@@ -3138,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 out:
        if (bctl->flags & BTRFS_BALANCE_RESUME)
                __cancel_balance(fs_info);
-        else
+        else {
                kfree(bctl);
+                atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+        }
        return ret;
 }
@@ -3156,7 +3168,6 @@ static int balance_kthread(void *data)
                ret = btrfs_balance(fs_info->balance_ctl, NULL);
        }
-        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
@@ -3179,7 +3190,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
                return 0;
        }
-        WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
        if (IS_ERR(tsk))
                return PTR_ERR(tsk);
@@ -3233,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
        btrfs_balance_sys(leaf, item, &disk_bargs);
        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+        WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
        mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
@@ -3496,7 +3508,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        { 1, 1, 2, 2, 2, 2 /* raid1 */ },
        { 1, 2, 1, 1, 1, 2 /* dup */ },
        { 1, 1, 0, 2, 1, 1 /* raid0 */ },
-        { 1, 1, 0, 1, 1, 1 /* single */ },
+        { 1, 1, 1, 1, 1, 1 /* single */ },
 };
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ce5cbd717bfc..210fce2df308 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -226,6 +226,8 @@ compose_mount_options_out:
 compose_mount_options_err:
        kfree(mountdata);
        mountdata = ERR_PTR(rc);
+        kfree(*devname);
+        *devname = NULL;
        goto compose_mount_options_out;
 }
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 17c3643e5950..12b3da39733b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1917,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
        }
        case AF_INET6: {
                struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
-                struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs;
+                struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
                return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr);
        }
        default:
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 7ff49852b0cb..911649a47dd5 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 #endif
                return -EINVAL;
-#ifdef CONFIG_COMPAT
+        /*
-        if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN)
+         * can't compare against COMPAT/dlm_write_request32 because
-#else
+         * we don't yet know if is64bit is zero
+         */
        if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
-#endif
                return -EINVAL;
        kbuf = kzalloc(count + 1, GFP_NOFS);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index b906ed17a839..9802de0f85e6 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -281,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
 {
        struct gfs2_sbd *sdp = gl->gl_sbd;
        struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+        int lvb_needs_unlock = 0;
        int error;
        if (gl->gl_lksb.sb_lkid == 0) {
@@ -294,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
        gfs2_update_request_times(gl);
        /* don't want to skip dlm_unlock writing the lvb when lock is ex */
+        if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE))
+                lvb_needs_unlock = 1;
        if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
-            gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) {
+            !lvb_needs_unlock) {
                gfs2_glock_free(gl);
                return;
        }
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index dd057bc6b65b..fc8dc20fdeb9 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -177,11 +177,31 @@ out_nofree:
        return mnt;
 }
+static int
+nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+        if (NFS_FH(dentry->d_inode)->size != 0)
+                return nfs_getattr(mnt, dentry, stat);
+        generic_fillattr(dentry->d_inode, stat);
+        return 0;
+}
+static int
+nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr)
+{
+        if (NFS_FH(dentry->d_inode)->size != 0)
+                return nfs_setattr(dentry, attr);
+        return -EACCES;
+}
 const struct inode_operations nfs_mountpoint_inode_operations = {
        .getattr        = nfs_getattr,
+        .setattr        = nfs_setattr,
 };
 const struct inode_operations nfs_referral_inode_operations = {
+        .getattr        = nfs_namespace_getattr,
+        .setattr        = nfs_namespace_setattr,
 };
 static void nfs_expire_automounts(struct work_struct *work)
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index acc347268124..2e9779b58b7a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
        error = nfs4_discover_server_trunking(clp, &old);
        if (error < 0)
                goto error;
+        nfs_put_client(clp);
        if (clp != old) {
                clp->cl_preserve_clid = true;
-                nfs_put_client(clp);
                clp = old;
-                atomic_inc(&clp->cl_count);
        }
        return clp;
@@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new,
                .clientid       = new->cl_clientid,
                .confirm        = new->cl_confirm,
        };
-        int status;
+        int status = -NFS4ERR_STALE_CLIENTID;
        spin_lock(&nn->nfs_client_lock);
        list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new,
                if (prev)
                        nfs_put_client(prev);
+                prev = pos;
                status = nfs4_proc_setclientid_confirm(pos, &clid, cred);
-                if (status == 0) {
+                switch (status) {
+                case -NFS4ERR_STALE_CLIENTID:
+                        break;
+                case 0:
                        nfs4_swap_callback_idents(pos, new);
-                        nfs_put_client(pos);
+                        prev = NULL;
                        *result = pos;
                        dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
                                __func__, pos, atomic_read(&pos->cl_count));
-                        return 0;
+                default:
-                }
+                        goto out;
-                if (status != -NFS4ERR_STALE_CLIENTID) {
-                        nfs_put_client(pos);
-                        dprintk("NFS: <-- %s status = %d, no result\n",
-                                __func__, status);
-                        return status;
                }
                spin_lock(&nn->nfs_client_lock);
-                prev = pos;
        }
+        spin_unlock(&nn->nfs_client_lock);
-        /*
+        /* No match found. The server lost our clientid */
-         * No matching nfs_client found.  This should be impossible,
+out:
-         * because the new nfs_client has already been added to
-         * nfs_client_list by nfs_get_client().
-         *
-         * Don't BUG(), since the caller is holding a mutex.
-         */
        if (prev)
                nfs_put_client(prev);
-        spin_unlock(&nn->nfs_client_lock);
+        dprintk("NFS: <-- %s status = %d\n", __func__, status);
-        pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
+        return status;
-        return -NFS4ERR_STALE_CLIENTID;
 }
 #ifdef CONFIG_NFS_V4_1
@@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
 {
        struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id);
        struct nfs_client *pos, *n, *prev = NULL;
-        int error;
+        int status = -NFS4ERR_STALE_CLIENTID;
        spin_lock(&nn->nfs_client_lock);
        list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) {
@@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new,
                                nfs_put_client(prev);
                        prev = pos;
-                        error = nfs_wait_client_init_complete(pos);
+                        nfs4_schedule_lease_recovery(pos);
-                        if (error < 0) {
+                        status = nfs_wait_client_init_complete(pos);
+                        if (status < 0) {
                                nfs_put_client(pos);
                                spin_lock(&nn->nfs_client_lock);
                                continue;
                        }
+                        status = pos->cl_cons_state;
                        spin_lock(&nn->nfs_client_lock);
+                        if (status < 0)
+                                continue;
                }
                if (pos->rpc_ops != new->rpc_ops)
@@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
                if (!nfs4_match_serverowners(pos, new))
                        continue;
+                atomic_inc(&pos->cl_count);
                spin_unlock(&nn->nfs_client_lock);
                dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
                        __func__, pos, atomic_read(&pos->cl_count));
@@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new,
                return 0;
        }
-        /*
+        /* No matching nfs_client found. */
-         * No matching nfs_client found.  This should be impossible,
-         * because the new nfs_client has already been added to
-         * nfs_client_list by nfs_get_client().
-         *
-         * Don't BUG(), since the caller is holding a mutex.
-         */
        spin_unlock(&nn->nfs_client_lock);
-        pr_err("NFS: %s Error: no matching nfs_client found\n", __func__);
+        dprintk("NFS: <-- %s status = %d\n", __func__, status);
-        return -NFS4ERR_STALE_CLIENTID;
+        return status;
 }
 #endif  /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9448c579d41a..e61f68d5ef21 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp,
        clp->cl_confirm = clid.confirm;
        status = nfs40_walk_client_list(clp, result, cred);
-        switch (status) {
+        if (status == 0) {
-        case -NFS4ERR_STALE_CLIENTID:
-                set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-        case 0:
                /* Sustain the lease, even if it's empty.  If the clientid4
                 * goes stale it's of no use for trunking discovery. */
                nfs4_schedule_state_renewal(*result);
-                break;
        }
 out:
        return status;
 }
@@ -1863,6 +1858,7 @@ again:
        case -ETIMEDOUT:
        case -EAGAIN:
                ssleep(1);
+        case -NFS4ERR_STALE_CLIENTID:
                dprintk("NFS: %s after status %d, retrying\n",
                        __func__, status);
                goto again;
@@ -2022,8 +2018,18 @@ static int nfs4_reset_session(struct nfs_client *clp)
        nfs4_begin_drain_session(clp);
        cred = nfs4_get_exchange_id_cred(clp);
        status = nfs4_proc_destroy_session(clp->cl_session, cred);
-        if (status && status != -NFS4ERR_BADSESSION &&
+        switch (status) {
-            status != -NFS4ERR_DEADSESSION) {
+        case 0:
+        case -NFS4ERR_BADSESSION:
+        case -NFS4ERR_DEADSESSION:
+                break;
+        case -NFS4ERR_BACK_CHAN_BUSY:
+        case -NFS4ERR_DELAY:
+                set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
+                status = 0;
+                ssleep(1);
+                goto out;
+        default:
                status = nfs4_recovery_handle_error(clp, status);
                goto out;
        }
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2e7e8c878e5d..b056b1628722 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2589,27 +2589,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
        struct nfs_server *server;
        struct dentry *mntroot = ERR_PTR(-ENOMEM);
        struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod;
-        int error;
-        dprintk("--> nfs_xdev_mount_common()\n");
+        dprintk("--> nfs_xdev_mount()\n");
        mount_info.mntfh = mount_info.cloned->fh;
        /* create a new volume representation */
        server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor);
-        if (IS_ERR(server)) {
-                error = PTR_ERR(server);
-                goto out_err;
-        }
-        mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod);
+        if (IS_ERR(server))
-        dprintk("<-- nfs_xdev_mount_common() = 0\n");
+                mntroot = ERR_CAST(server);
-out:
+        else
-        return mntroot;
+                mntroot = nfs_fs_mount_common(server, flags,
+                                dev_name, &mount_info, nfs_mod);
-out_err:
+        dprintk("<-- nfs_xdev_mount() = %ld\n",
-        dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error);
+                        IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L);
-        goto out;
+        return mntroot;
 }
 #if IS_ENABLED(CONFIG_NFS_V4)
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index fdb180769485..f3859354e41a 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
        if (ret < 0)
                printk(KERN_ERR "NILFS: GC failed during preparation: "
                        "cannot read source blocks: err=%d\n", ret);
-        else
+        else {
+                if (nilfs_sb_need_update(nilfs))
+                        set_nilfs_discontinued(nilfs);
                ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
+        }
        nilfs_remove_all_gcinodes(nilfs);
        clear_nilfs_gc_running(nilfs);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4111a40ebe1a..5f707e537171 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,11 +86,11 @@ xfs_destroy_ioend(
        }
        if (ioend->io_iocb) {
+                inode_dio_done(ioend->io_inode);
                if (ioend->io_isasync) {
                        aio_complete(ioend->io_iocb, ioend->io_error ?
                                        ioend->io_error : ioend->io_result, 0);
                }
-                inode_dio_done(ioend->io_inode);
        }
        mempool_free(ioend, xfs_ioend_pool);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 0e92d12765d2..cdb2d3348583 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4680,9 +4680,6 @@ __xfs_bmapi_allocate(
                        return error;
        }
-        if (bma->flags & XFS_BMAPI_STACK_SWITCH)
-                bma->stack_switch = 1;
        error = xfs_bmap_alloc(bma);
        if (error)
                return error;
@@ -4956,6 +4953,9 @@ xfs_bmapi_write(
        bma.flist = flist;
        bma.firstblock = firstblock;
+        if (flags & XFS_BMAPI_STACK_SWITCH)
+                bma.stack_switch = 1;
        while (bno < end && n < *nmap) {
                inhole = eof || bma.got.br_startoff > bno;
                wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 56d1614760cf..fbbb9eb92e32 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -487,6 +487,7 @@ _xfs_buf_find(
        struct rb_node          *parent;
        xfs_buf_t               *bp;
        xfs_daddr_t             blkno = map[0].bm_bn;
+        xfs_daddr_t             eofs;
        int                     numblks = 0;
        int                     i;
@@ -498,6 +499,23 @@ _xfs_buf_find(
        ASSERT(!(numbytes < (1 << btp->bt_sshift)));
        ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
+        /*
+         * Corrupted block numbers can get through to here, unfortunately, so we
+         * must reject any that are negative or beyond the filesystem end.
+         */
+        eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
+        if (blkno < 0 || blkno >= eofs) {
+                /*
+                 * XXX (dgc): we should really be returning EFSCORRUPTED here,
+                 * but none of the higher level infrastructure supports
+                 * returning a specific error on buffer lookup failures.
+                 */
+                xfs_alert(btp->bt_mount,
+                          "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
+                          __func__, blkno, eofs);
+                return NULL;
+        }
        /* get tree root */
        pag = xfs_perag_get(btp->bt_mount,
                                xfs_daddr_to_agno(btp->bt_mount, blkno));
@@ -1487,6 +1505,8 @@ restart:
        while (!list_empty(&btp->bt_lru)) {
                bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
                if (atomic_read(&bp->b_hold) > 1) {
+                        trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+                        list_move_tail(&bp->b_lru, &btp->bt_lru);
                        spin_unlock(&btp->bt_lru_lock);
                        delay(100);
                        goto restart;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 77b09750e92c..3f9949fee391 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -652,7 +652,10 @@ xfs_buf_item_unlock(
        /*
         * If the buf item isn't tracking any data, free it, otherwise drop the
-         * reference we hold to it.
+         * reference we hold to it. If we are aborting the transaction, this may
+         * be the only reference to the buf item, so we free it anyway
+         * regardless of whether it is dirty or not. A dirty abort implies a
+         * shutdown, anyway.
         */
        clean = 1;
        for (i = 0; i < bip->bli_format_count; i++) {
@@ -664,7 +667,12 @@ xfs_buf_item_unlock(
        }
        if (clean)
                xfs_buf_item_relse(bp);
-        else
+        else if (aborted) {
+                if (atomic_dec_and_test(&bip->bli_refcount)) {
+                        ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+                        xfs_buf_item_relse(bp);
+                }
+        } else
                atomic_dec(&bip->bli_refcount);
        if (!hold)
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index d0e9c74d3d96..a8bd26b82ecb 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,10 +246,10 @@ xfs_swap_extents(
                goto out_unlock;
        }
-        error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+        error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
        if (error)
                goto out_unlock;
-        truncate_pagecache_range(VFS_I(ip), 0, -1);
+        truncate_pagecache_range(VFS_I(tip), 0, -1);
        /* Verify O_DIRECT for ftmp */
        if (VN_CACHED(VFS_I(tip)) != 0) {
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index add06b4e9a63..364818eef40e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -351,6 +351,15 @@ xfs_iomap_prealloc_size(
                }
                if (shift)
                        alloc_blocks >>= shift;
+                /*
+                 * If we are still trying to allocate more space than is
+                 * available, squash the prealloc hard (large file on a small
+                 * filesystem, lowspace thresholds below MAXEXTLEN). Stop once
+                 * alloc_blocks hits zero, or freesp == 0 would spin forever.
+                 */
+                while (alloc_blocks && alloc_blocks >= freesp)
+                        alloc_blocks >>= 4;
        }
        if (alloc_blocks < mp->m_writeio_blocks)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da508463ff10..7d6df7c00c36 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify(
                return;
        }
        /* quietly fail */
-        xfs_buf_ioerror(bp, EFSCORRUPTED);
+        xfs_buf_ioerror(bp, EWRONGFS);
 }
 static void
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 2e137d4a85ae..16a812977eab 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
 DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
 DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);