author		Chris Mason <clm@fb.com>	2015-12-23 16:28:35 -0500
committer	Chris Mason <clm@fb.com>	2015-12-23 16:28:35 -0500
commit		a53fe2576955171449711933242d8fb1c13a7d5c (patch)
tree		6fd4a583f2e9606c6c61226b40a060e4c1b2dc60
parent		bb9d687618695e8291f1e6209eb3211d231f97bb (diff)
parent		e44081ef611832b47a86abf4e36dc0ed2e950884 (diff)
Merge branch 'for-chris-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/fdmanana/linux into for-linus-4.5
-rw-r--r--	fs/btrfs/extent-tree.c	19
-rw-r--r--	fs/btrfs/inode.c	127
-rw-r--r--	fs/btrfs/transaction.c	17
-rw-r--r--	fs/btrfs/tree-defrag.c	27
-rw-r--r--	fs/btrfs/volumes.c	17
5 files changed, 151 insertions(+), 56 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 065055342881..d436117e20dd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3684,11 +3684,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	/*
-	 * We don't need the lock here since we are protected by the transaction
-	 * commit. We want to do the cache_save_setup first and then run the
+	 * Even though we are in the critical section of the transaction commit,
+	 * we can still have concurrent tasks adding elements to this
+	 * transaction's list of dirty block groups. These tasks correspond to
+	 * endio free space workers started when writeback finishes for a
+	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+	 * allocate new block groups as a result of COWing nodes of the root
+	 * tree when updating the free space inode. The writeback for the space
+	 * caches is triggered by an earlier call to
+	 * btrfs_start_dirty_block_groups() and iterations of the following
+	 * loop.
+	 * Also we want to do the cache_save_setup first and then run the
 	 * delayed refs to make sure we have the best chance at doing this all
 	 * in one shot.
 	 */
+	spin_lock(&cur_trans->dirty_bgs_lock);
 	while (!list_empty(&cur_trans->dirty_bgs)) {
 		cache = list_first_entry(&cur_trans->dirty_bgs,
 					 struct btrfs_block_group_cache,
@@ -3700,11 +3710,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * finish and then do it all again
 		 */
 		if (!list_empty(&cache->io_list)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
 			list_del_init(&cache->io_list);
 			btrfs_wait_cache_io(root, trans, cache,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
+			spin_lock(&cur_trans->dirty_bgs_lock);
 		}
 
3710 /* 3722 /*
@@ -3712,6 +3724,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * on any pending IO
 		 */
 		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
 		should_put = 1;
 
 		cache_save_setup(cache, trans, path);
@@ -3743,7 +3756,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		/* if its not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
+		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
 
 	while (!list_empty(io)) {
 		cache = list_first_entry(io, struct btrfs_block_group_cache,
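
Every extent-tree.c hunk above enforces the same rule: cur_trans->dirty_bgs_lock is held whenever the dirty_bgs list is tested or modified, and dropped around calls that may block, such as btrfs_wait_cache_io(). Below is a minimal userspace sketch of that drop-and-retake idiom (a pthread mutex standing in for the kernel spinlock; all names are illustrative, this is not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int id;
};

static struct node *dirty_list;	/* concurrently appended to by producers */
static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stands in for the blocking write-out done per block group. */
static void write_out(struct node *n)
{
	printf("writing block group %d\n", n->id);
}

static void write_dirty_list(void)
{
	pthread_mutex_lock(&dirty_lock);
	while (dirty_list) {
		struct node *n = dirty_list;

		dirty_list = n->next;			/* like list_del_init() */
		pthread_mutex_unlock(&dirty_lock);	/* drop before blocking */
		write_out(n);
		free(n);
		pthread_mutex_lock(&dirty_lock);	/* retake, then re-test */
	}
	pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct node *n = malloc(sizeof(*n));

		n->id = i;
		n->next = dirty_list;
		dirty_list = n;
	}
	write_dirty_list();
	return 0;
}

Because the lock is retaken before the loop condition is re-evaluated, entries appended by concurrent endio workers while the lock was dropped are picked up on a later iteration rather than lost.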
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index dc616329c9a1..bdb0008712c8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
 	struct btrfs_root *root;
 };
 
+struct btrfs_dio_data {
+	u64 outstanding_extents;
+	u64 reserve;
+	u64 unsubmitted_oe_range_start;
+	u64 unsubmitted_oe_range_end;
+};
+
 static const struct inode_operations btrfs_dir_inode_operations;
 static const struct inode_operations btrfs_symlink_inode_operations;
 static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -7408,25 +7415,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
 		} else {
-			/* Screw you mmap */
-			ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
-			if (ret)
-				break;
-			ret = filemap_fdatawait_range(inode->i_mapping,
-						      lockstart,
-						      lockend);
-			if (ret)
-				break;
-
 			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
+			 * We could trigger writeback for this range (and wait
+			 * for it to complete) and then invalidate the pages for
+			 * this range (through invalidate_inode_pages2_range()),
+			 * but that can lead us to a deadlock with a concurrent
+			 * call to readpages() (a buffered read or a defrag call
+			 * triggered a readahead) on a page lock due to an
+			 * ordered dio extent we created before but did not have
+			 * yet a corresponding bio submitted (whence it can not
+			 * complete), which makes readpages() wait for that
+			 * ordered extent to complete while holding a lock on
+			 * that page.
 			 */
-			ret = invalidate_inode_pages2_range(inode->i_mapping,
-					lockstart >> PAGE_CACHE_SHIFT,
-					lockend >> PAGE_CACHE_SHIFT);
-			if (ret)
-				break;
+			ret = -ENOTBLK;
+			break;
 		}
 
 		cond_resched();
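
With the writeback-plus-invalidate attempt removed, encountering cached pages in the target range now simply gives up on direct IO with -ENOTBLK so the write can proceed through the page cache. A toy model of that fallback contract, assuming hypothetical helper names for the two btrfs write paths:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins: the real paths are btrfs' direct IO and
 * buffered write code; only the -ENOTBLK contract is modeled here. */
static int try_direct_write(void)
{
	return -ENOTBLK;	/* page cache pages were in the way */
}

static int buffered_write(void)
{
	return 0;
}

int main(void)
{
	int ret = try_direct_write();

	/* -ENOTBLK means "retry via the page cache", not "fail the write" */
	if (ret == -ENOTBLK)
		ret = buffered_write();
	printf("ret = %d\n", ret);
	return ret;
}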
@@ -7482,11 +7485,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 	return em;
 }
 
-struct btrfs_dio_data {
-	u64 outstanding_extents;
-	u64 reserve;
-};
-
 static void adjust_dio_outstanding_extents(struct inode *inode,
 					   struct btrfs_dio_data *dio_data,
 					   const u64 len)
@@ -7670,6 +7668,7 @@ unlock:
 		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
+		dio_data->unsubmitted_oe_range_end = start + len;
 		current->journal_info = dio_data;
 	}
 
@@ -7992,22 +7991,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
 	bio_put(bio);
 }
 
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+						    const u64 offset,
+						    const u64 bytes,
+						    const int uptodate)
 {
-	struct btrfs_dio_private *dip = bio->bi_private;
-	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_extent *ordered = NULL;
-	u64 ordered_offset = dip->logical_offset;
-	u64 ordered_bytes = dip->bytes;
-	struct bio *dio_bio;
+	u64 ordered_offset = offset;
+	u64 ordered_bytes = bytes;
 	int ret;
 
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
 						   ordered_bytes,
-						   !bio->bi_error);
+						   uptodate);
 	if (!ret)
 		goto out_test;
 
@@ -8020,13 +8019,22 @@ out_test:
 	 * our bio might span multiple ordered extents. If we haven't
 	 * completed the accounting for the whole dio, go back and try again
 	 */
-	if (ordered_offset < dip->logical_offset + dip->bytes) {
-		ordered_bytes = dip->logical_offset + dip->bytes -
-			ordered_offset;
+	if (ordered_offset < offset + bytes) {
+		ordered_bytes = offset + bytes - ordered_offset;
 		ordered = NULL;
 		goto again;
 	}
-	dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct bio *dio_bio = dip->dio_bio;
+
+	btrfs_endio_direct_write_update_ordered(dip->inode,
+						dip->logical_offset,
+						dip->bytes,
+						!bio->bi_error);
 
 	kfree(dip);
 
@@ -8334,6 +8342,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 		dip->subio_endio = btrfs_subio_endio_read;
 	}
 
+	/*
+	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
+	 * even if we fail to submit a bio, because in such case we do the
+	 * corresponding error handling below and it must not be done a second
+	 * time by btrfs_direct_IO().
+	 */
+	if (write) {
+		struct btrfs_dio_data *dio_data = current->journal_info;
+
+		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+			dip->bytes;
+		dio_data->unsubmitted_oe_range_start =
+			dio_data->unsubmitted_oe_range_end;
+	}
+
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;
@@ -8362,24 +8385,15 @@ free_ordered:
 		dip = NULL;
 		io_bio = NULL;
 	} else {
-		if (write) {
-			struct btrfs_ordered_extent *ordered;
-
-			ordered = btrfs_lookup_ordered_extent(inode,
-							      file_offset);
-			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-			/*
-			 * Decrements our ref on the ordered extent and removes
-			 * the ordered extent from the inode's ordered tree,
-			 * doing all the proper resource cleanup such as for the
-			 * reserved space and waking up any waiters for this
-			 * ordered extent (through btrfs_remove_ordered_extent).
-			 */
-			btrfs_finish_ordered_io(ordered);
-		} else {
+		if (write)
+			btrfs_endio_direct_write_update_ordered(inode,
+							file_offset,
+							dio_bio->bi_iter.bi_size,
+							0);
+		else
 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
 				      file_offset + dio_bio->bi_iter.bi_size - 1);
-		}
+
 		dio_bio->bi_error = -EIO;
 		/*
 		 * Releases and cleans up our dio_bio, no need to bio_put()
@@ -8479,6 +8493,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		 * originally calculated. Abuse current->journal_info for this.
 		 */
 		dio_data.reserve = round_up(count, root->sectorsize);
+		dio_data.unsubmitted_oe_range_start = (u64)offset;
+		dio_data.unsubmitted_oe_range_end = (u64)offset;
 		current->journal_info = &dio_data;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 			    &BTRFS_I(inode)->runtime_flags)) {
@@ -8497,6 +8513,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			if (dio_data.reserve)
 				btrfs_delalloc_release_space(inode, offset,
 							     dio_data.reserve);
+			/*
+			 * On error we might have left some ordered extents
+			 * without submitting corresponding bios for them, so
+			 * clean them up to avoid other tasks getting them
+			 * and waiting for them to complete forever.
+			 */
+			if (dio_data.unsubmitted_oe_range_start <
+			    dio_data.unsubmitted_oe_range_end)
+				btrfs_endio_direct_write_update_ordered(inode,
+					dio_data.unsubmitted_oe_range_start,
+					dio_data.unsubmitted_oe_range_end -
+						dio_data.unsubmitted_oe_range_start,
+					0);
 		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode, offset,
 						     count - (size_t)ret);
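
Taken together, the inode.c hunks maintain a half-open range [unsubmitted_oe_range_start, unsubmitted_oe_range_end) of file offsets covered by ordered extents that were created but never got a bio: it starts empty at offset (the btrfs_direct_IO() hunk above), grows when btrfs_get_blocks_direct() creates an ordered extent (the unlock: hunk), is reset once btrfs_submit_direct() takes over error handling, and anything still in it when the dio fails is completed with uptodate == 0. A userspace sketch of just this bookkeeping (field names mirror the patch, the logic is simplified):

#include <stdint.h>
#include <stdio.h>

struct dio_data {
	uint64_t unsubmitted_oe_range_start;
	uint64_t unsubmitted_oe_range_end;
};

/* Stands in for btrfs_endio_direct_write_update_ordered(..., 0). */
static void fail_ordered_range(uint64_t offset, uint64_t bytes)
{
	printf("failing ordered extents in [%llu, %llu)\n",
	       (unsigned long long)offset,
	       (unsigned long long)(offset + bytes));
}

int main(void)
{
	/* dio starts with an empty range at the write offset */
	struct dio_data d = { .unsubmitted_oe_range_start = 4096,
			      .unsubmitted_oe_range_end = 4096 };

	/* get_block time: an ordered extent was created for 8 KiB ... */
	d.unsubmitted_oe_range_end += 8192;

	/* ... and the dio failed before any bio was submitted for it */
	if (d.unsubmitted_oe_range_start < d.unsubmitted_oe_range_end)
		fail_ordered_range(d.unsubmitted_oe_range_start,
				   d.unsubmitted_oe_range_end -
				   d.unsubmitted_oe_range_start);
	return 0;
}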
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index be8eae80ff65..f85ccf634ca1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			list_del_init(&em->list);
 			free_extent_map(em);
 		}
+		/*
+		 * If any block groups are found in ->deleted_bgs then it's
+		 * because the transaction was aborted and a commit did not
+		 * happen (things failed before writing the new superblock
+		 * and calling btrfs_finish_extent_commit()), so we can not
+		 * discard the physical locations of the block groups.
+		 */
+		while (!list_empty(&transaction->deleted_bgs)) {
+			struct btrfs_block_group_cache *cache;
+
+			cache = list_first_entry(&transaction->deleted_bgs,
+						 struct btrfs_block_group_cache,
+						 bg_list);
+			list_del_init(&cache->bg_list);
+			btrfs_put_block_group_trimming(cache);
+			btrfs_put_block_group(cache);
+		}
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }
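
This drain only ever sees entries on the abort path; after a successful commit the list is emptied by btrfs_finish_extent_commit(), where discarding the freed space is safe. The loop reduces to a reference-counting pattern, sketched below in userspace C (two references per entry, both dropped while unlinking; types and counts are illustrative, not the kernel's):

#include <stdio.h>
#include <stdlib.h>

struct bg {
	struct bg *next;
	int refs;
};

static void bg_put(struct bg *b)
{
	if (--b->refs == 0)
		free(b);
}

static void drain_deleted_bgs(struct bg **head)
{
	while (*head) {
		struct bg *b = *head;

		*head = b->next;	/* like list_del_init(&cache->bg_list) */
		bg_put(b);		/* like btrfs_put_block_group_trimming() */
		bg_put(b);		/* like btrfs_put_block_group() */
	}
}

int main(void)
{
	struct bg *head = NULL;

	for (int i = 0; i < 3; i++) {
		struct bg *b = malloc(sizeof(*b));

		b->refs = 2;	/* one ref for the list, one for trimming */
		b->next = head;
		head = b;
	}
	drain_deleted_bgs(&head);
	printf("deleted_bgs drained\n");
	return 0;
}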
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f31db4325339..cb65089127cc 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	btrfs_release_path(path);
+	/*
+	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a
+	 * deadlock later (attempting to write lock an already write locked
+	 * leaf).
+	 */
+	path->lowest_level = 1;
 	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
 	if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		ret = 0;
 		goto out;
 	}
-	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
-					   min_trans);
+	/*
+	 * The node at level 1 must always be locked when our path has
+	 * keep_locks set and lowest_level is 1, regardless of the value of
+	 * path->slots[1].
+	 */
+	BUG_ON(path->locks[1] == 0);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
 				 &last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		WARN_ON(ret == -EAGAIN);
 		goto out;
 	}
+	/*
+	 * Now that we reallocated the node we can find the next key. Note that
+	 * btrfs_find_next_key() can release our path and do another search
+	 * without COWing, this is because even with path->keep_locks = 1,
+	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
+	 * node when path->slots[node_level - 1] does not point to the last
+	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+	 * we search for the next key after reallocating our node.
+	 */
+	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+					   min_trans);
 	if (next_key_ret == 0) {
 		memcpy(&root->defrag_progress, &key, sizeof(key));
 		ret = -EAGAIN;
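
The net effect of the three tree-defrag.c hunks is an ordering constraint: btrfs_realloc_node() needs the level-1 lock that the search took, while btrfs_find_next_key() may release the path, so the next-key lookup must happen only after the reallocation. A toy model of that constraint (illustrative, not kernel code):

#include <assert.h>
#include <stdio.h>

static int node_locked = 1;	/* taken by the earlier search, like locks[1] */

static void realloc_node(void)
{
	assert(node_locked);	/* mirrors BUG_ON(path->locks[1] == 0) */
	printf("node reallocated under lock\n");
}

static int find_next_key(void)
{
	node_locked = 0;	/* may release the path, like unlock_up() */
	return 123;		/* next key to resume defrag from */
}

int main(void)
{
	realloc_node();			/* must come first ...              */
	int next = find_next_key();	/* ... because this can drop locks  */

	printf("resume at key %d\n", next);
	return 0;
}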
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 54d2d2cc2c92..a37cc0478bb2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4825,20 +4825,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * Take the device list mutex to prevent races with the final phase of
+	 * a device replace operation that replaces the device object associated
+	 * with the map's stripes, because the device object's id can change
+	 * at any time during that final phase of the device replace operation
+	 * (dev-replace.c:btrfs_dev_replace_finishing()).
+	 */
+	mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
 		device = map->stripes[i].dev;
 		dev_offset = map->stripes[i].physical;
 
 		ret = btrfs_update_device(trans, device);
 		if (ret)
-			goto out;
+			break;
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     chunk_root->root_key.objectid,
 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 					     chunk_offset, dev_offset,
 					     stripe_size);
 		if (ret)
-			goto out;
+			break;
+	}
+	if (ret) {
+		mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+		goto out;
 	}
 
 	stripe = &chunk->stripe;
@@ -4851,6 +4863,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
 	}
+	mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
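
Because the loop now runs with device_list_mutex held, the two error exits become break instead of goto out, funneling every failure through a single unlock before leaving. The control-flow shape, reduced to runnable userspace C with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dev_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stands in for btrfs_update_device()/btrfs_alloc_dev_extent(). */
static int update_one(int i)
{
	return i == 2 ? -5 /* -EIO */ : 0;
}

static int finish_chunk_alloc(void)
{
	int ret = 0;

	pthread_mutex_lock(&dev_list_lock);
	for (int i = 0; i < 4; i++) {
		ret = update_one(i);
		if (ret)
			break;	/* with the lock held, a direct goto would leak it */
	}
	if (ret) {
		pthread_mutex_unlock(&dev_list_lock);
		goto out;
	}
	pthread_mutex_unlock(&dev_list_lock);
out:
	return ret;
}

int main(void)
{
	printf("ret = %d\n", finish_chunk_alloc());
	return 0;
}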