8 files changed, 107 insertions, 57 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80953528572d..68f322f600a0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3163,6 +3163,9 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
 int __init btrfs_init_cachep(void);
 void __cold btrfs_destroy_cachep(void);
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
+                              struct btrfs_root *root, int *new,
+                              struct btrfs_path *path);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
                         struct btrfs_root *root, int *was_new);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b0ab41da91d1..3f0b6d1936e8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1664,9 +1664,8 @@ static int cleaner_kthread(void *arg)
        struct btrfs_root *root = arg;
        struct btrfs_fs_info *fs_info = root->fs_info;
        int again;
-        struct btrfs_trans_handle *trans;
-        do {
+        while (1) {
                again = 0;
                /* Make the cleaner go to sleep early. */
@@ -1715,42 +1714,16 @@ static int cleaner_kthread(void *arg)
                 */
                btrfs_delete_unused_bgs(fs_info);
 sleep:
+                if (kthread_should_park())
+                        kthread_parkme();
+                if (kthread_should_stop())
+                        return 0;
                if (!again) {
                        set_current_state(TASK_INTERRUPTIBLE);
-                        if (!kthread_should_stop())
+                        schedule();
-                                schedule();
                        __set_current_state(TASK_RUNNING);
                }
-        } while (!kthread_should_stop());
-        /*
-         * Transaction kthread is stopped before us and wakes us up.
-         * However we might have started a new transaction and COWed some
-         * tree blocks when deleting unused block groups for example. So
-         * make sure we commit the transaction we started to have a clean
-         * shutdown when evicting the btree inode - if it has dirty pages
-         * when we do the final iput() on it, eviction will trigger a
-         * writeback for it which will fail with null pointer dereferences
-         * since work queues and other resources were already released and
-         * destroyed by the time the iput/eviction/writeback is made.
-         */
-        trans = btrfs_attach_transaction(root);
-        if (IS_ERR(trans)) {
-                if (PTR_ERR(trans) != -ENOENT)
-                        btrfs_err(fs_info,
-                                  "cleaner transaction attach returned %ld",
-                                  PTR_ERR(trans));
-        } else {
-                int ret;
-                ret = btrfs_commit_transaction(trans);
-                if (ret)
-                        btrfs_err(fs_info,
-                                  "cleaner open transaction commit returned %d",
-                                  ret);
        }
-        return 0;
 }
 static int transaction_kthread(void *arg)
@@ -3931,6 +3904,13 @@ void close_ctree(struct btrfs_fs_info *fs_info)
        int ret;
        set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
+        /*
+         * We don't want the cleaner to start new transactions, add more delayed
+         * iputs, etc. while we're closing. We can't use kthread_stop() yet
+         * because that frees the task_struct, and the transaction kthread might
+         * still try to wake up the cleaner.
+         */
+        kthread_park(fs_info->cleaner_kthread);
        /* wait for the qgroup rescan worker to stop */
        btrfs_qgroup_wait_for_completion(fs_info, false);
@@ -3958,9 +3938,8 @@ void close_ctree(struct btrfs_fs_info *fs_info)
        if (!sb_rdonly(fs_info->sb)) {
                /*
-                 * If the cleaner thread is stopped and there are
+                 * The cleaner kthread is parked, so do one final pass over
-                 * block groups queued for removal, the deletion will be
+                 * unused block groups.
-                 * skipped when we quit the cleaner thread.
                 */
                btrfs_delete_unused_bgs(fs_info);
@@ -4359,13 +4338,23 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
        unpin = pinned_extents;
 again:
        while (1) {
+                /*
+                 * btrfs_finish_extent_commit() may find the same range as we
+                 * do between find_first_extent_bit() and clear_extent_dirty().
+                 * Hence, hold the unused_bg_unpin_mutex to avoid unpinning
+                 * the same extent range twice.
+                 */
+                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY, NULL);
-                if (ret)
+                if (ret) {
+                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                        break;
+                }
                clear_extent_dirty(unpin, start, end);
                btrfs_error_unpin_extent_range(fs_info, start, end);
+                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
                cond_resched();
        }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4ba0aedc878b..74aa552f4793 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -75,7 +75,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
         * sure NOFS is set to keep us from deadlocking.
         */
        nofs_flag = memalloc_nofs_save();
-        inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+        inode = btrfs_iget_path(fs_info->sb, &location, root, NULL, path);
+        btrfs_release_path(path);
        memalloc_nofs_restore(nofs_flag);
        if (IS_ERR(inode))
                return inode;
@@ -838,6 +839,25 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
        path->search_commit_root = 1;
        path->skip_locking = 1;
+        /*
+         * We must pass a path with search_commit_root set to btrfs_iget_path()
+         * to avoid a deadlock when allocating extents for the tree root.
+         *
+         * When we are COWing an extent buffer from the tree root and looking
+         * for a free extent, at extent-tree.c:find_free_extent(), we can find
+         * a block group without its free space cache loaded. When we find one
+         * we must load its space cache, which requires reading its free space
+         * cache's inode item from the root tree. If this inode item is located
+         * in the same leaf that we started COWing before, then we end up in a
+         * deadlock on the extent buffer (trying to read lock it when we
+         * previously write locked it).
+         *
+         * It's safe to read the inode item using the commit root because
+         * block groups, once loaded, stay in memory forever (until they are
+         * removed) as well as their space caches once loaded. New block groups
+         * once created get their ->cached field set to BTRFS_CACHE_FINISHED so
+         * we will never try to read their inode item while the fs is mounted.
+         */
        inode = lookup_free_space_inode(fs_info, block_group, path);
        if (IS_ERR(inode)) {
                btrfs_free_path(path);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d3df5b52278c..9ea4c6f0352f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1531,12 +1531,11 @@ out_check:
        }
        btrfs_release_path(path);
-        if (cur_offset <= end && cow_start == (u64)-1) {
+        if (cur_offset <= end && cow_start == (u64)-1)
                cow_start = cur_offset;
-                cur_offset = end;
-        }
        if (cow_start != (u64)-1) {
+                cur_offset = end;
                ret = cow_file_range(inode, locked_page, cow_start, end, end,
                                     page_started, nr_written, 1, NULL);
                if (ret)
@@ -3570,10 +3569,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
 * read an inode from the btree into the in-memory inode
 */
-static int btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode,
+                                   struct btrfs_path *in_path)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-        struct btrfs_path *path;
+        struct btrfs_path *path = in_path;
        struct extent_buffer *leaf;
        struct btrfs_inode_item *inode_item;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3589,15 +3589,18 @@ static int btrfs_read_locked_inode(struct inode *inode)
        if (!ret)
                filled = true;
-        path = btrfs_alloc_path();
+        if (!path) {
-        if (!path)
+                path = btrfs_alloc_path();
-                return -ENOMEM;
+                if (!path)
+                        return -ENOMEM;
+        }
        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
        if (ret) {
-                btrfs_free_path(path);
+                if (path != in_path)
+                        btrfs_free_path(path);
                return ret;
        }
@@ -3722,7 +3725,8 @@ cache_acl:
                                  btrfs_ino(BTRFS_I(inode)),
                                  root->root_key.objectid, ret);
        }
-        btrfs_free_path(path);
+        if (path != in_path)
+                btrfs_free_path(path);
        if (!maybe_acls)
                cache_no_acl(inode);
@@ -5644,8 +5648,9 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
 /* Get an inode object given its location and corresponding root.
 * Returns in *is_new if the inode was read from disk
 */
-struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
-                         struct btrfs_root *root, int *new)
+                              struct btrfs_root *root, int *new,
+                              struct btrfs_path *path)
 {
        struct inode *inode;
@@ -5656,7 +5661,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
        if (inode->i_state & I_NEW) {
                int ret;
-                ret = btrfs_read_locked_inode(inode);
+                ret = btrfs_read_locked_inode(inode, path);
                if (!ret) {
                        inode_tree_add(inode);
                        unlock_new_inode(inode);
@@ -5678,6 +5683,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
        return inode;
 }
+struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
+                         struct btrfs_root *root, int *new)
+{
+        return btrfs_iget_path(s, location, root, new, NULL);
+}
 static struct inode *new_simple_dir(struct super_block *s,
                                    struct btrfs_key *key,
                                    struct btrfs_root *root)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3ca6943827ef..802a628e9f7d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3488,6 +3488,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
                        const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
                        len = round_down(i_size_read(src), sz) - loff;
+                        if (len == 0)
+                                return 0;
                        olen = len;
                }
        }
@@ -4257,9 +4259,17 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
                goto out_unlock;
        if (len == 0)
                olen = len = src->i_size - off;
-        /* if we extend to eof, continue to block boundary */
+        /*
-        if (off + len == src->i_size)
+         * If we extend to eof, continue to block boundary if and only if the
+         * destination end offset matches the destination file's size, otherwise
+         * we would be corrupting data by placing the eof block into the middle
+         * of a file.
+         */
+        if (off + len == src->i_size) {
+                if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
+                        goto out_unlock;
                len = ALIGN(src->i_size, bs) - off;
+        }
        if (len == 0) {
                ret = 0;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b362b45dd757..cbc9d0d2c12d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1916,7 +1916,7 @@ restore:
 }
 /* Used to sort the devices by max_avail(descending sort) */
-static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
                                       const void *dev_info2)
 {
        if (((struct btrfs_device_info *)dev_info1)->max_avail >
@@ -1945,8 +1945,8 @@ static inline void btrfs_descending_sort_devices(
 * The helper to calc the free space on the devices that can be used to store
 * file data.
 */
-static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
-                                       u64 *free_bytes)
+                                              u64 *free_bytes)
 {
        struct btrfs_device_info *devices_info;
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index cab0b1f1f741..efcf89a8ba44 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info *fs_info,
            type != (BTRFS_BLOCK_GROUP_METADATA |
                           BTRFS_BLOCK_GROUP_DATA)) {
                block_group_err(fs_info, leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu or 0x%llx",
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
                        type, hweight64(type),
                        BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
                        BTRFS_BLOCK_GROUP_SYSTEM,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e07f3376b7df..a5ce99a6c936 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4396,6 +4396,23 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
        logged_end = end;
        list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+                /*
+                 * Skip extents outside our logging range. It's important to do
+                 * it for correctness because if we don't ignore them, we may
+                 * log them before their ordered extent completes, and therefore
+                 * we could log them without logging their respective checksums
+                 * (the checksum items are added to the csum tree at the very
+                 * end of btrfs_finish_ordered_io()). Also leave such extents
+                 * outside of our range in the list, since we may have another
+                 * ranged fsync in the near future that needs them. If an extent
+                 * outside our range corresponds to a hole, log it to avoid
+                 * leaving gaps between extents (fsck will complain when we are
+                 * not using the NO_HOLES feature).
+                 */
+                if ((em->start > end || em->start + em->len <= start) &&
+                    em->block_start != EXTENT_MAP_HOLE)
+                        continue;
                list_del_init(&em->list);
                /*
                 * Just an arbitrary number, this can be really CPU intensive