Diffstat (limited to 'fs')
31 files changed, 2739 insertions, 641 deletions
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index cb7f3fe9c9f6..d897ef803b3b 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -94,6 +94,7 @@
 #include <linux/mutex.h>
 #include <linux/genhd.h>
 #include <linux/blkdev.h>
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "hash.h"
@@ -326,9 +327,6 @@ static int btrfsic_handle_extent_data(struct btrfsic_state *state,
 static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
			     struct btrfsic_block_data_ctx *block_ctx_out,
			     int mirror_num);
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-				  u32 len, struct block_device *bdev,
-				  struct btrfsic_block_data_ctx *block_ctx_out);
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
			      struct btrfsic_block_data_ctx *block_ctx);
@@ -1326,24 +1324,25 @@ static int btrfsic_create_link_to_next_block(
 		l = NULL;
 		next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
 	} else {
-		if (next_block->logical_bytenr != next_bytenr &&
-		    !(!next_block->is_metadata &&
-		      0 == next_block->logical_bytenr)) {
-			printk(KERN_INFO
-			       "Referenced block @%llu (%s/%llu/%d)"
-			       " found in hash table, %c,"
-			       " bytenr mismatch (!= stored %llu).\n",
-			       next_bytenr, next_block_ctx->dev->name,
-			       next_block_ctx->dev_bytenr, *mirror_nump,
-			       btrfsic_get_block_type(state, next_block),
-			       next_block->logical_bytenr);
-		} else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-			printk(KERN_INFO
-			       "Referenced block @%llu (%s/%llu/%d)"
-			       " found in hash table, %c.\n",
-			       next_bytenr, next_block_ctx->dev->name,
-			       next_block_ctx->dev_bytenr, *mirror_nump,
-			       btrfsic_get_block_type(state, next_block));
+		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+			if (next_block->logical_bytenr != next_bytenr &&
+			    !(!next_block->is_metadata &&
+			      0 == next_block->logical_bytenr))
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block),
+				       next_block->logical_bytenr);
+			else
+				printk(KERN_INFO
+				       "Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+				       next_bytenr, next_block_ctx->dev->name,
+				       next_block_ctx->dev_bytenr, *mirror_nump,
+				       btrfsic_get_block_type(state,
+							      next_block));
+		}
 		next_block->logical_bytenr = next_bytenr;
 
 		next_block->mirror_num = *mirror_nump;
@@ -1529,7 +1528,9 @@ static int btrfsic_handle_extent_data(
 			return -1;
 		}
 		if (!block_was_created) {
-			if (next_block->logical_bytenr != next_bytenr &&
+			if ((state->print_mask &
+			     BTRFSIC_PRINT_MASK_VERBOSE) &&
+			    next_block->logical_bytenr != next_bytenr &&
 			    !(!next_block->is_metadata &&
 			      0 == next_block->logical_bytenr)) {
 				printk(KERN_INFO
@@ -1607,25 +1608,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
 	return ret;
 }
 
-static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
-				  u32 len, struct block_device *bdev,
-				  struct btrfsic_block_data_ctx *block_ctx_out)
-{
-	block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
-	block_ctx_out->dev_bytenr = bytenr;
-	block_ctx_out->start = bytenr;
-	block_ctx_out->len = len;
-	block_ctx_out->datav = NULL;
-	block_ctx_out->pagev = NULL;
-	block_ctx_out->mem_to_free = NULL;
-	if (NULL != block_ctx_out->dev) {
-		return 0;
-	} else {
-		printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
-		return -ENXIO;
-	}
-}
-
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 {
 	if (block_ctx->mem_to_free) {
@@ -1901,25 +1883,26 @@ again:
							       dev_state,
							       dev_bytenr);
 			}
-			if (block->logical_bytenr != bytenr &&
-			    !(!block->is_metadata &&
-			      block->logical_bytenr == 0))
-				printk(KERN_INFO
-				       "Written block @%llu (%s/%llu/%d)"
-				       " found in hash table, %c,"
-				       " bytenr mismatch"
-				       " (!= stored %llu).\n",
-				       bytenr, dev_state->name, dev_bytenr,
-				       block->mirror_num,
-				       btrfsic_get_block_type(state, block),
-				       block->logical_bytenr);
-			else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
-				printk(KERN_INFO
-				       "Written block @%llu (%s/%llu/%d)"
-				       " found in hash table, %c.\n",
-				       bytenr, dev_state->name, dev_bytenr,
-				       block->mirror_num,
-				       btrfsic_get_block_type(state, block));
+			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) {
+				if (block->logical_bytenr != bytenr &&
+				    !(!block->is_metadata &&
+				      block->logical_bytenr == 0))
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr,
+					       block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block),
+					       block->logical_bytenr);
+				else
+					printk(KERN_INFO
+					       "Written block @%llu (%s/%llu/%d) found in hash table, %c.\n",
+					       bytenr, dev_state->name,
+					       dev_bytenr, block->mirror_num,
+					       btrfsic_get_block_type(state,
+								      block));
+			}
 			block->logical_bytenr = bytenr;
 		} else {
 			if (num_pages * PAGE_CACHE_SIZE <
@@ -2002,24 +1985,13 @@ again:
 				}
 			}
 
-			if (block->is_superblock)
-				ret = btrfsic_map_superblock(state, bytenr,
-							     processed_len,
-							     bdev, &block_ctx);
-			else
-				ret = btrfsic_map_block(state, bytenr, processed_len,
-							&block_ctx, 0);
-			if (ret) {
-				printk(KERN_INFO
-				       "btrfsic: btrfsic_map_block(root @%llu)"
-				       " failed!\n", bytenr);
-				goto continue_loop;
-			}
-			block_ctx.datav = mapped_datav;
-			/* the following is required in case of writes to mirrors,
-			 * use the same that was used for the lookup */
 			block_ctx.dev = dev_state;
 			block_ctx.dev_bytenr = dev_bytenr;
+			block_ctx.start = bytenr;
+			block_ctx.len = processed_len;
+			block_ctx.pagev = NULL;
+			block_ctx.mem_to_free = NULL;
+			block_ctx.datav = mapped_datav;
 
 		if (is_metadata || state->include_extent_data) {
 			block->never_written = 0;
@@ -2133,10 +2105,6 @@ again:
 			/* this is getting ugly for the
			 * include_extent_data case... */
 			bytenr = 0;	/* unknown */
-			block_ctx.start = bytenr;
-			block_ctx.len = processed_len;
-			block_ctx.mem_to_free = NULL;
-			block_ctx.pagev = NULL;
 		} else {
 			processed_len = state->metablock_size;
 			bytenr = btrfs_stack_header_bytenr(
@@ -2149,22 +2117,15 @@ again:
				       "Written block @%llu (%s/%llu/?)"
				       " !found in hash table, M.\n",
				       bytenr, dev_state->name, dev_bytenr);
-
-			ret = btrfsic_map_block(state, bytenr, processed_len,
-						&block_ctx, 0);
-			if (ret) {
-				printk(KERN_INFO
-				       "btrfsic: btrfsic_map_block(root @%llu)"
-				       " failed!\n",
-				       dev_bytenr);
-				goto continue_loop;
-			}
 		}
-		block_ctx.datav = mapped_datav;
-		/* the following is required in case of writes to mirrors,
-		 * use the same that was used for the lookup */
+
 		block_ctx.dev = dev_state;
 		block_ctx.dev_bytenr = dev_bytenr;
+		block_ctx.start = bytenr;
+		block_ctx.len = processed_len;
+		block_ctx.pagev = NULL;
+		block_ctx.mem_to_free = NULL;
+		block_ctx.datav = mapped_datav;
 
 		block = btrfsic_block_alloc();
 		if (NULL == block) {
@@ -3130,10 +3091,13 @@ int btrfsic_mount(struct btrfs_root *root,
		       root->sectorsize, PAGE_CACHE_SIZE);
 		return -1;
 	}
-	state = kzalloc(sizeof(*state), GFP_NOFS);
-	if (NULL == state) {
-		printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
-		return -1;
+	state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	if (!state) {
+		state = vzalloc(sizeof(*state));
+		if (!state) {
+			printk(KERN_INFO "btrfs check-integrity: vzalloc() failed!\n");
+			return -1;
+		}
 	}
 
 	if (!btrfsic_is_initialized) {
@@ -3277,5 +3241,8 @@ void btrfsic_unmount(struct btrfs_root *root,
 
 	mutex_unlock(&btrfsic_mutex);
 
-	kfree(state);
+	if (is_vmalloc_addr(state))
+		vfree(state);
+	else
+		kfree(state);
 }
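Note on the allocation change above: btrfsic_mount() now tries a physically contiguous kzalloc() first (with __GFP_NOWARN so the expected failure for this large struct stays quiet) and falls back to vzalloc(), and btrfsic_unmount() picks the matching free routine via is_vmalloc_addr(). A minimal sketch of that pattern follows; the struct name and size are placeholders, not taken from the patch, and newer kernels can use kvfree() for the free half.

	#include <linux/slab.h>
	#include <linux/vmalloc.h>
	#include <linux/mm.h>		/* is_vmalloc_addr() */

	/* hypothetical large object, standing in for struct btrfsic_state */
	struct big_state {
		char payload[2 * 1024 * 1024];
	};

	static struct big_state *big_state_alloc(void)
	{
		struct big_state *s;

		/* try the slab allocator first, without a failure warning */
		s = kzalloc(sizeof(*s), GFP_KERNEL | __GFP_NOWARN);
		if (!s)
			s = vzalloc(sizeof(*s));	/* zeroed, virtually contiguous */
		return s;			/* NULL only if both attempts failed */
	}

	static void big_state_free(struct big_state *s)
	{
		if (is_vmalloc_addr(s))
			vfree(s);
		else
			kfree(s);
	}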
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index dcd9be32ac57..e9df8862012c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -224,16 +224,19 @@ out:
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline void end_compressed_writeback(struct inode *inode, u64 start,
-					      unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode,
+					       const struct compressed_bio *cb)
 {
-	unsigned long index = start >> PAGE_CACHE_SHIFT;
-	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long index = cb->start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_CACHE_SHIFT;
 	struct page *pages[16];
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
 	int ret;
 
+	if (cb->errors)
+		mapping_set_error(inode->i_mapping, -EIO);
+
 	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
@@ -244,6 +247,8 @@ static noinline void end_compressed_writeback(struct inode *inode, u64 start,
 			continue;
 		}
 		for (i = 0; i < ret; i++) {
+			if (cb->errors)
+				SetPageError(pages[i]);
 			end_page_writeback(pages[i]);
 			page_cache_release(pages[i]);
 		}
@@ -287,10 +292,11 @@ static void end_compressed_bio_write(struct bio *bio, int err)
 		tree->ops->writepage_end_io_hook(cb->compressed_pages[0],
						 cb->start,
						 cb->start + cb->len - 1,
-						 NULL, 1);
+						 NULL,
+						 err ? 0 : 1);
 	cb->compressed_pages[0]->mapping = NULL;
 
-	end_compressed_writeback(inode, cb->start, cb->len);
+	end_compressed_writeback(inode, cb);
 	/* note, our inode could be gone now */
 
 	/*
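The compression changes above make a failed compressed write visible to user space: the whole mapping is flagged with mapping_set_error() and each page additionally gets PageError before its writeback bit is cleared, so a later fsync()/filemap_fdatawait() reports -EIO instead of success. A condensed sketch of that per-page pattern, where "failed" is a hypothetical flag standing in for cb->errors:

	#include <linux/fs.h>
	#include <linux/pagemap.h>
	#include <linux/page-flags.h>

	/* finish writeback on one page, recording an I/O error if needed */
	static void end_one_page_writeback(struct address_space *mapping,
					   struct page *page, bool failed)
	{
		if (failed) {
			/* latched in the mapping, reported by the next fsync() */
			mapping_set_error(mapping, -EIO);
			/* per-page marker for readers of PageError() */
			SetPageError(page);
		}
		end_page_writeback(page);
	}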
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 150822ee0a0b..14a72ed14ef7 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2929,7 +2929,7 @@ done:
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
-	if (ret < 0)
+	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index fe69edda11fb..e6fbbd74b716 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -607,6 +607,7 @@ struct btrfs_path {
	unsigned int leave_spinning:1;
	unsigned int search_commit_root:1;
	unsigned int need_commit_sem:1;
+	unsigned int skip_release_on_error:1;
 };
 
 /*
@@ -1170,6 +1171,7 @@ struct btrfs_space_info {
	struct percpu_counter total_bytes_pinned;
 
	struct list_head list;
+	struct list_head ro_bgs;
 
	struct rw_semaphore groups_sem;
	/* for block groups in our same type */
@@ -1276,6 +1278,8 @@ struct btrfs_block_group_cache {
	unsigned int ro:1;
	unsigned int dirty:1;
	unsigned int iref:1;
+	unsigned int has_caching_ctl:1;
+	unsigned int removed:1;
 
	int disk_cache_state;
 
@@ -1305,6 +1309,11 @@ struct btrfs_block_group_cache {
 
	/* For delayed block group creation or deletion of empty block groups */
	struct list_head bg_list;
+
+	/* For read-only block groups */
+	struct list_head ro_list;
+
+	atomic_t trimming;
 };
 
 /* delayed seq elem */
@@ -1402,6 +1411,11 @@ struct btrfs_fs_info {
	 */
	u64 last_trans_log_full_commit;
	unsigned long mount_opt;
+	/*
+	 * Track requests for actions that need to be done during transaction
+	 * commit (like for some mount options).
+	 */
+	unsigned long pending_changes;
	unsigned long compress_type:4;
	int commit_interval;
	/*
@@ -1729,6 +1743,12 @@ struct btrfs_fs_info {
 
	/* For btrfs to record security options */
	struct security_mnt_opts security_opts;
+
+	/*
+	 * Chunks that can't be freed yet (under a trim/discard operation)
+	 * and will be latter freed. Protected by fs_info->chunk_mutex.
+	 */
+	struct list_head pinned_chunks;
 };
 
 struct btrfs_subvolume_writers {
@@ -2093,7 +2113,6 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
 #define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
-#define BTRFS_MOUNT_CHANGE_INODE_CACHE	(1 << 24)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(8192)
@@ -2103,6 +2122,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
					 BTRFS_MOUNT_##opt)
+
 #define btrfs_set_and_info(root, opt, fmt, args...)	\
 {							\
	if (!btrfs_test_opt(root, opt))			\
@@ -2118,6 +2138,49 @@ struct btrfs_ioctl_defrag_range_args {
 }
 
 /*
+ * Requests for changes that need to be done during transaction commit.
+ *
+ * Internal mount options that are used for special handling of the real
+ * mount options (eg. cannot be set during remount and have to be set during
+ * transaction commit)
+ */
+
+#define BTRFS_PENDING_SET_INODE_MAP_CACHE	(0)
+#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE	(1)
+#define BTRFS_PENDING_COMMIT			(2)
+
+#define btrfs_test_pending(info, opt)	\
+	test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_set_pending(info, opt)	\
+	set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+#define btrfs_clear_pending(info, opt)	\
+	clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes)
+
+/*
+ * Helpers for setting pending mount option changes.
+ *
+ * Expects corresponding macros
+ * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name
+ */
+#define btrfs_set_pending_and_info(info, opt, fmt, args...)	\
+do {								\
+	if (!btrfs_raw_test_opt((info)->mount_opt, opt)) {	\
+		btrfs_info((info), fmt, ##args);		\
+		btrfs_set_pending((info), SET_##opt);		\
+		btrfs_clear_pending((info), CLEAR_##opt);	\
+	}							\
+} while(0)
+
+#define btrfs_clear_pending_and_info(info, opt, fmt, args...)	\
+do {								\
+	if (btrfs_raw_test_opt((info)->mount_opt, opt)) {	\
+		btrfs_info((info), fmt, ##args);		\
+		btrfs_set_pending((info), CLEAR_##opt);		\
+		btrfs_clear_pending((info), SET_##opt);		\
+	}							\
+} while(0)
+
+/*
  * Inode flags
  */
 #define BTRFS_INODE_NODATASUM		(1 << 0)
@@ -3351,7 +3414,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 group_start);
+			     struct btrfs_root *root, u64 group_start,
+			     struct extent_map *em);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root);
@@ -3427,8 +3491,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info);
 int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		     int level, int *slot);
@@ -3686,6 +3750,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 int verify_dir_item(struct btrfs_root *root,
		    struct extent_buffer *leaf,
		    struct btrfs_dir_item *dir_item);
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+						 struct btrfs_path *path,
+						 const char *name,
+						 int name_len);
 
 /* orphan.c */
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -3857,6 +3925,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint);
+int btrfs_inode_check_errors(struct inode *inode);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -3901,6 +3970,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
		      struct page **pages, size_t num_pages,
		      loff_t pos, size_t write_bytes,
		      struct extent_state **cached);
+int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4097,7 +4167,12 @@ int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
 void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
+
+static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+	btrfs_bio_counter_sub(fs_info, 1);
+}
 
 /* reada.c */
 struct reada_control {
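The pending-changes machinery added to ctree.h above is a bit mask kept in fs_info->pending_changes: remount (or other callers) only queue a request, and the switch in mount_opt happens at transaction commit. An illustrative sketch of how the two halves are meant to fit together; the two helper function names are made up here, only the macros and the BTRFS_MOUNT_/BTRFS_PENDING_ names come from the header:

	/* remount side: queue the change instead of touching mount_opt directly */
	static void request_inode_map_cache(struct btrfs_fs_info *info, bool enable)
	{
		if (enable)
			btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
						   "enabling inode map caching");
		else
			btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
						     "disabling inode map caching");
	}

	/* commit side: apply whatever was queued since the last transaction */
	static void apply_pending_changes(struct btrfs_fs_info *info)
	{
		if (btrfs_test_pending(info, SET_INODE_MAP_CACHE))
			btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
		if (btrfs_test_pending(info, CLEAR_INODE_MAP_CACHE))
			btrfs_clear_opt(info->mount_opt, INODE_MAP_CACHE);
		info->pending_changes = 0;
	}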
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 6f662b34ba0e..ca6a3a3b6b6c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -316,11 +316,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_device *src_device = NULL;
 
-	if (btrfs_fs_incompat(fs_info, RAID56)) {
-		btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
-		return -EOPNOTSUPP;
-	}
-
	switch (args->start.cont_reading_from_srcdev_mode) {
	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
	case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
@@ -422,9 +417,15 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
			      &dev_replace->scrub_progress, 0, 1);
 
	ret = btrfs_dev_replace_finishing(root->fs_info, ret);
-	WARN_ON(ret);
+	/* don't warn if EINPROGRESS, someone else might be running scrub */
+	if (ret == -EINPROGRESS) {
+		args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
+		ret = 0;
+	} else {
+		WARN_ON(ret);
+	}
 
-	return 0;
+	return ret;
 
 leave:
	dev_replace->srcdev = NULL;
@@ -542,7 +543,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
		btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
-		return 0;
+		return scrub_ret;
	}
 
	printk_in_rcu(KERN_INFO
@@ -571,15 +572,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
	list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
	fs_info->fs_devices->rw_devices++;
 
-	/* replace the sysfs entry */
-	btrfs_kobj_rm_device(fs_info, src_device);
-	btrfs_kobj_add_device(fs_info, tgt_device);
-
	btrfs_dev_replace_unlock(dev_replace);
 
	btrfs_rm_dev_replace_blocked(fs_info);
 
-	btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+	btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
 
	btrfs_rm_dev_replace_unblocked(fs_info);
 
@@ -594,6 +591,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);
 
+	/* replace the sysfs entry */
+	btrfs_kobj_rm_device(fs_info, src_device);
+	btrfs_kobj_add_device(fs_info, tgt_device);
+	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
+
	/* write back the superblocks */
	trans = btrfs_start_transaction(root, 0);
	if (!IS_ERR(trans))
@@ -920,9 +922,9 @@ void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
	percpu_counter_inc(&fs_info->bio_counter);
 }
 
-void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 {
-	percpu_counter_dec(&fs_info->bio_counter);
+	percpu_counter_sub(&fs_info->bio_counter, amount);
 
	if (waitqueue_active(&fs_info->replace_wait))
		wake_up(&fs_info->replace_wait);
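btrfs_bio_counter_sub() above generalizes the old btrfs_bio_counter_dec() so a caller holding several bio references can drop them in a single call, with btrfs_bio_counter_dec() kept as a static inline wrapper in ctree.h; the waitqueue_active()/wake_up() pair lets dev-replace wait for in-flight bios to drain. A stripped-down sketch of that counting scheme with illustrative names (not the btrfs ones):

	#include <linux/percpu_counter.h>
	#include <linux/wait.h>

	struct io_gate {
		struct percpu_counter inflight;
		wait_queue_head_t wait;
	};

	/* take nr references, e.g. one per split bio */
	static void io_gate_get(struct io_gate *g, long nr)
	{
		percpu_counter_add(&g->inflight, nr);
	}

	/* drop nr references and wake a possible waiter */
	static void io_gate_put(struct io_gate *g, long nr)
	{
		percpu_counter_sub(&g->inflight, nr);
		if (waitqueue_active(&g->wait))	/* cheap check before wake_up() */
			wake_up(&g->wait);
	}

	/* block until every reference taken with io_gate_get() is dropped */
	static void io_gate_drain(struct io_gate *g)
	{
		wait_event(g->wait, percpu_counter_sum(&g->inflight) == 0);
	}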
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index fc8df866e919..1752625fb4dd 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -21,10 +21,6 @@
 #include "hash.h"
 #include "transaction.h"
 
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len);
-
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision. data_size indicates how big the item inserted should be. On
@@ -383,9 +379,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
			      struct btrfs_path *path,
			      const char *name, int name_len)
 {
	struct btrfs_dir_item *dir_item;
	unsigned long name_ptr;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1bf9f897065d..30965120772b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2384,6 +2384,8 @@ int open_ctree(struct super_block *sb,
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	INIT_LIST_HEAD(&fs_info->pinned_chunks);
+
	ret = btrfs_alloc_stripe_hash_table(fs_info);
	if (ret) {
		err = ret;
@@ -2830,9 +2832,11 @@ retry_root_backup:
		btrfs_set_opt(fs_info->mount_opt, SSD);
	}
 
-	/* Set the real inode map cache flag */
-	if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
-		btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+	/*
+	 * Mount does not set all options immediatelly, we can do it now and do
+	 * not have to wait for transaction commit
+	 */
+	btrfs_apply_pending_changes(fs_info);
 
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
@@ -3713,6 +3717,17 @@ void close_ctree(struct btrfs_root *root)
 
	btrfs_free_block_rsv(root, root->orphan_block_rsv);
	root->orphan_block_rsv = NULL;
+
+	lock_chunks(root);
+	while (!list_empty(&fs_info->pinned_chunks)) {
+		struct extent_map *em;
+
+		em = list_first_entry(&fs_info->pinned_chunks,
+				      struct extent_map, list);
+		list_del_init(&em->list);
+		free_extent_map(em);
+	}
+	unlock_chunks(root);
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3839,12 +3854,12 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
	 */
	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->root);
+				btrfs_super_root(sb));
	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
-		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
-				sb->chunk_root);
+		printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+				btrfs_super_chunk_root(sb));
	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
-		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
				btrfs_super_log_root(sb));
 
	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
@@ -4129,6 +4144,25 @@ again:
	return 0;
 }
 
+static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
+				       struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&cur_trans->pending_ordered)) {
+		ordered = list_first_entry(&cur_trans->pending_ordered,
+					   struct btrfs_ordered_extent,
+					   trans_list);
+		list_del_init(&ordered->trans_list);
+		spin_unlock(&fs_info->trans_lock);
+
+		btrfs_put_ordered_extent(ordered);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+}
+
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				   struct btrfs_root *root)
 {
@@ -4140,6 +4174,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
	cur_trans->state = TRANS_STATE_UNBLOCKED;
	wake_up(&root->fs_info->transaction_wait);
 
+	btrfs_free_pending_ordered(cur_trans, root->fs_info);
	btrfs_destroy_delayed_inodes(root);
	btrfs_assert_delayed_root_empty(root);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 47c1ba141082..222d6aea4a8a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -315,12 +315,6 @@ get_caching_control(struct btrfs_block_group_cache *cache) | |||
315 | struct btrfs_caching_control *ctl; | 315 | struct btrfs_caching_control *ctl; |
316 | 316 | ||
317 | spin_lock(&cache->lock); | 317 | spin_lock(&cache->lock); |
318 | if (cache->cached != BTRFS_CACHE_STARTED) { | ||
319 | spin_unlock(&cache->lock); | ||
320 | return NULL; | ||
321 | } | ||
322 | |||
323 | /* We're loading it the fast way, so we don't have a caching_ctl. */ | ||
324 | if (!cache->caching_ctl) { | 318 | if (!cache->caching_ctl) { |
325 | spin_unlock(&cache->lock); | 319 | spin_unlock(&cache->lock); |
326 | return NULL; | 320 | return NULL; |
@@ -594,6 +588,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
594 | spin_unlock(&cache->lock); | 588 | spin_unlock(&cache->lock); |
595 | 589 | ||
596 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { | 590 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { |
591 | mutex_lock(&caching_ctl->mutex); | ||
597 | ret = load_free_space_cache(fs_info, cache); | 592 | ret = load_free_space_cache(fs_info, cache); |
598 | 593 | ||
599 | spin_lock(&cache->lock); | 594 | spin_lock(&cache->lock); |
@@ -601,15 +596,19 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
601 | cache->caching_ctl = NULL; | 596 | cache->caching_ctl = NULL; |
602 | cache->cached = BTRFS_CACHE_FINISHED; | 597 | cache->cached = BTRFS_CACHE_FINISHED; |
603 | cache->last_byte_to_unpin = (u64)-1; | 598 | cache->last_byte_to_unpin = (u64)-1; |
599 | caching_ctl->progress = (u64)-1; | ||
604 | } else { | 600 | } else { |
605 | if (load_cache_only) { | 601 | if (load_cache_only) { |
606 | cache->caching_ctl = NULL; | 602 | cache->caching_ctl = NULL; |
607 | cache->cached = BTRFS_CACHE_NO; | 603 | cache->cached = BTRFS_CACHE_NO; |
608 | } else { | 604 | } else { |
609 | cache->cached = BTRFS_CACHE_STARTED; | 605 | cache->cached = BTRFS_CACHE_STARTED; |
606 | cache->has_caching_ctl = 1; | ||
610 | } | 607 | } |
611 | } | 608 | } |
612 | spin_unlock(&cache->lock); | 609 | spin_unlock(&cache->lock); |
610 | mutex_unlock(&caching_ctl->mutex); | ||
611 | |||
613 | wake_up(&caching_ctl->wait); | 612 | wake_up(&caching_ctl->wait); |
614 | if (ret == 1) { | 613 | if (ret == 1) { |
615 | put_caching_control(caching_ctl); | 614 | put_caching_control(caching_ctl); |
@@ -627,6 +626,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
627 | cache->cached = BTRFS_CACHE_NO; | 626 | cache->cached = BTRFS_CACHE_NO; |
628 | } else { | 627 | } else { |
629 | cache->cached = BTRFS_CACHE_STARTED; | 628 | cache->cached = BTRFS_CACHE_STARTED; |
629 | cache->has_caching_ctl = 1; | ||
630 | } | 630 | } |
631 | spin_unlock(&cache->lock); | 631 | spin_unlock(&cache->lock); |
632 | wake_up(&caching_ctl->wait); | 632 | wake_up(&caching_ctl->wait); |
@@ -3162,7 +3162,19 @@ next_block_group(struct btrfs_root *root, | |||
3162 | struct btrfs_block_group_cache *cache) | 3162 | struct btrfs_block_group_cache *cache) |
3163 | { | 3163 | { |
3164 | struct rb_node *node; | 3164 | struct rb_node *node; |
3165 | |||
3165 | spin_lock(&root->fs_info->block_group_cache_lock); | 3166 | spin_lock(&root->fs_info->block_group_cache_lock); |
3167 | |||
3168 | /* If our block group was removed, we need a full search. */ | ||
3169 | if (RB_EMPTY_NODE(&cache->cache_node)) { | ||
3170 | const u64 next_bytenr = cache->key.objectid + cache->key.offset; | ||
3171 | |||
3172 | spin_unlock(&root->fs_info->block_group_cache_lock); | ||
3173 | btrfs_put_block_group(cache); | ||
3174 | cache = btrfs_lookup_first_block_group(root->fs_info, | ||
3175 | next_bytenr); | ||
3176 | return cache; | ||
3177 | } | ||
3166 | node = rb_next(&cache->cache_node); | 3178 | node = rb_next(&cache->cache_node); |
3167 | btrfs_put_block_group(cache); | 3179 | btrfs_put_block_group(cache); |
3168 | if (node) { | 3180 | if (node) { |
@@ -3504,6 +3516,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, | |||
3504 | found->chunk_alloc = 0; | 3516 | found->chunk_alloc = 0; |
3505 | found->flush = 0; | 3517 | found->flush = 0; |
3506 | init_waitqueue_head(&found->wait); | 3518 | init_waitqueue_head(&found->wait); |
3519 | INIT_LIST_HEAD(&found->ro_bgs); | ||
3507 | 3520 | ||
3508 | ret = kobject_init_and_add(&found->kobj, &space_info_ktype, | 3521 | ret = kobject_init_and_add(&found->kobj, &space_info_ktype, |
3509 | info->space_info_kobj, "%s", | 3522 | info->space_info_kobj, "%s", |
@@ -5425,7 +5438,17 @@ static int update_block_group(struct btrfs_root *root, | |||
5425 | spin_unlock(&cache->space_info->lock); | 5438 | spin_unlock(&cache->space_info->lock); |
5426 | } else { | 5439 | } else { |
5427 | old_val -= num_bytes; | 5440 | old_val -= num_bytes; |
5441 | btrfs_set_block_group_used(&cache->item, old_val); | ||
5442 | cache->pinned += num_bytes; | ||
5443 | cache->space_info->bytes_pinned += num_bytes; | ||
5444 | cache->space_info->bytes_used -= num_bytes; | ||
5445 | cache->space_info->disk_used -= num_bytes * factor; | ||
5446 | spin_unlock(&cache->lock); | ||
5447 | spin_unlock(&cache->space_info->lock); | ||
5428 | 5448 | ||
5449 | set_extent_dirty(info->pinned_extents, | ||
5450 | bytenr, bytenr + num_bytes - 1, | ||
5451 | GFP_NOFS | __GFP_NOFAIL); | ||
5429 | /* | 5452 | /* |
5430 | * No longer have used bytes in this block group, queue | 5453 | * No longer have used bytes in this block group, queue |
5431 | * it for deletion. | 5454 | * it for deletion. |
@@ -5439,17 +5462,6 @@ static int update_block_group(struct btrfs_root *root, | |||
5439 | } | 5462 | } |
5440 | spin_unlock(&info->unused_bgs_lock); | 5463 | spin_unlock(&info->unused_bgs_lock); |
5441 | } | 5464 | } |
5442 | btrfs_set_block_group_used(&cache->item, old_val); | ||
5443 | cache->pinned += num_bytes; | ||
5444 | cache->space_info->bytes_pinned += num_bytes; | ||
5445 | cache->space_info->bytes_used -= num_bytes; | ||
5446 | cache->space_info->disk_used -= num_bytes * factor; | ||
5447 | spin_unlock(&cache->lock); | ||
5448 | spin_unlock(&cache->space_info->lock); | ||
5449 | |||
5450 | set_extent_dirty(info->pinned_extents, | ||
5451 | bytenr, bytenr + num_bytes - 1, | ||
5452 | GFP_NOFS | __GFP_NOFAIL); | ||
5453 | } | 5465 | } |
5454 | btrfs_put_block_group(cache); | 5466 | btrfs_put_block_group(cache); |
5455 | total -= num_bytes; | 5467 | total -= num_bytes; |
@@ -8511,6 +8523,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) | |||
8511 | min_allocable_bytes <= sinfo->total_bytes) { | 8523 | min_allocable_bytes <= sinfo->total_bytes) { |
8512 | sinfo->bytes_readonly += num_bytes; | 8524 | sinfo->bytes_readonly += num_bytes; |
8513 | cache->ro = 1; | 8525 | cache->ro = 1; |
8526 | list_add_tail(&cache->ro_list, &sinfo->ro_bgs); | ||
8514 | ret = 0; | 8527 | ret = 0; |
8515 | } | 8528 | } |
8516 | out: | 8529 | out: |
@@ -8565,15 +8578,20 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, | |||
8565 | 8578 | ||
8566 | /* | 8579 | /* |
8567 | * helper to account the unused space of all the readonly block group in the | 8580 | * helper to account the unused space of all the readonly block group in the |
8568 | * list. takes mirrors into account. | 8581 | * space_info. takes mirrors into account. |
8569 | */ | 8582 | */ |
8570 | static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) | 8583 | u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) |
8571 | { | 8584 | { |
8572 | struct btrfs_block_group_cache *block_group; | 8585 | struct btrfs_block_group_cache *block_group; |
8573 | u64 free_bytes = 0; | 8586 | u64 free_bytes = 0; |
8574 | int factor; | 8587 | int factor; |
8575 | 8588 | ||
8576 | list_for_each_entry(block_group, groups_list, list) { | 8589 | /* It's df, we don't care if it's racey */ |
8590 | if (list_empty(&sinfo->ro_bgs)) | ||
8591 | return 0; | ||
8592 | |||
8593 | spin_lock(&sinfo->lock); | ||
8594 | list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { | ||
8577 | spin_lock(&block_group->lock); | 8595 | spin_lock(&block_group->lock); |
8578 | 8596 | ||
8579 | if (!block_group->ro) { | 8597 | if (!block_group->ro) { |
@@ -8594,26 +8612,6 @@ static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list) | |||
8594 | 8612 | ||
8595 | spin_unlock(&block_group->lock); | 8613 | spin_unlock(&block_group->lock); |
8596 | } | 8614 | } |
8597 | |||
8598 | return free_bytes; | ||
8599 | } | ||
8600 | |||
8601 | /* | ||
8602 | * helper to account the unused space of all the readonly block group in the | ||
8603 | * space_info. takes mirrors into account. | ||
8604 | */ | ||
8605 | u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) | ||
8606 | { | ||
8607 | int i; | ||
8608 | u64 free_bytes = 0; | ||
8609 | |||
8610 | spin_lock(&sinfo->lock); | ||
8611 | |||
8612 | for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) | ||
8613 | if (!list_empty(&sinfo->block_groups[i])) | ||
8614 | free_bytes += __btrfs_get_ro_block_group_free_space( | ||
8615 | &sinfo->block_groups[i]); | ||
8616 | |||
8617 | spin_unlock(&sinfo->lock); | 8615 | spin_unlock(&sinfo->lock); |
8618 | 8616 | ||
8619 | return free_bytes; | 8617 | return free_bytes; |
@@ -8633,6 +8631,7 @@ void btrfs_set_block_group_rw(struct btrfs_root *root, | |||
8633 | cache->bytes_super - btrfs_block_group_used(&cache->item); | 8631 | cache->bytes_super - btrfs_block_group_used(&cache->item); |
8634 | sinfo->bytes_readonly -= num_bytes; | 8632 | sinfo->bytes_readonly -= num_bytes; |
8635 | cache->ro = 0; | 8633 | cache->ro = 0; |
8634 | list_del_init(&cache->ro_list); | ||
8636 | spin_unlock(&cache->lock); | 8635 | spin_unlock(&cache->lock); |
8637 | spin_unlock(&sinfo->lock); | 8636 | spin_unlock(&sinfo->lock); |
8638 | } | 8637 | } |
@@ -9002,7 +9001,9 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) | |||
9002 | INIT_LIST_HEAD(&cache->list); | 9001 | INIT_LIST_HEAD(&cache->list); |
9003 | INIT_LIST_HEAD(&cache->cluster_list); | 9002 | INIT_LIST_HEAD(&cache->cluster_list); |
9004 | INIT_LIST_HEAD(&cache->bg_list); | 9003 | INIT_LIST_HEAD(&cache->bg_list); |
9004 | INIT_LIST_HEAD(&cache->ro_list); | ||
9005 | btrfs_init_free_space_ctl(cache); | 9005 | btrfs_init_free_space_ctl(cache); |
9006 | atomic_set(&cache->trimming, 0); | ||
9006 | 9007 | ||
9007 | return cache; | 9008 | return cache; |
9008 | } | 9009 | } |
@@ -9195,9 +9196,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, | |||
9195 | int ret = 0; | 9196 | int ret = 0; |
9196 | 9197 | ||
9197 | list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { | 9198 | list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { |
9198 | list_del_init(&block_group->bg_list); | ||
9199 | if (ret) | 9199 | if (ret) |
9200 | continue; | 9200 | goto next; |
9201 | 9201 | ||
9202 | spin_lock(&block_group->lock); | 9202 | spin_lock(&block_group->lock); |
9203 | memcpy(&item, &block_group->item, sizeof(item)); | 9203 | memcpy(&item, &block_group->item, sizeof(item)); |
@@ -9212,6 +9212,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, | |||
9212 | key.objectid, key.offset); | 9212 | key.objectid, key.offset); |
9213 | if (ret) | 9213 | if (ret) |
9214 | btrfs_abort_transaction(trans, extent_root, ret); | 9214 | btrfs_abort_transaction(trans, extent_root, ret); |
9215 | next: | ||
9216 | list_del_init(&block_group->bg_list); | ||
9215 | } | 9217 | } |
9216 | } | 9218 | } |
9217 | 9219 | ||
@@ -9304,7 +9306,8 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
9304 | } | 9306 | } |
9305 | 9307 | ||
9306 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 9308 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
9307 | struct btrfs_root *root, u64 group_start) | 9309 | struct btrfs_root *root, u64 group_start, |
9310 | struct extent_map *em) | ||
9308 | { | 9311 | { |
9309 | struct btrfs_path *path; | 9312 | struct btrfs_path *path; |
9310 | struct btrfs_block_group_cache *block_group; | 9313 | struct btrfs_block_group_cache *block_group; |
@@ -9316,6 +9319,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
9316 | int ret; | 9319 | int ret; |
9317 | int index; | 9320 | int index; |
9318 | int factor; | 9321 | int factor; |
9322 | struct btrfs_caching_control *caching_ctl = NULL; | ||
9323 | bool remove_em; | ||
9319 | 9324 | ||
9320 | root = root->fs_info->extent_root; | 9325 | root = root->fs_info->extent_root; |
9321 | 9326 | ||
@@ -9400,6 +9405,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
9400 | spin_lock(&root->fs_info->block_group_cache_lock); | 9405 | spin_lock(&root->fs_info->block_group_cache_lock); |
9401 | rb_erase(&block_group->cache_node, | 9406 | rb_erase(&block_group->cache_node, |
9402 | &root->fs_info->block_group_cache_tree); | 9407 | &root->fs_info->block_group_cache_tree); |
9408 | RB_CLEAR_NODE(&block_group->cache_node); | ||
9403 | 9409 | ||
9404 | if (root->fs_info->first_logical_byte == block_group->key.objectid) | 9410 | if (root->fs_info->first_logical_byte == block_group->key.objectid) |
9405 | root->fs_info->first_logical_byte = (u64)-1; | 9411 | root->fs_info->first_logical_byte = (u64)-1; |
@@ -9411,6 +9417,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
9411 | * are still on the list after taking the semaphore | 9417 | * are still on the list after taking the semaphore |
9412 | */ | 9418 | */ |
9413 | list_del_init(&block_group->list); | 9419 | list_del_init(&block_group->list); |
9420 | list_del_init(&block_group->ro_list); | ||
9414 | if (list_empty(&block_group->space_info->block_groups[index])) { | 9421 | if (list_empty(&block_group->space_info->block_groups[index])) { |
9415 | kobj = block_group->space_info->block_group_kobjs[index]; | 9422 | kobj = block_group->space_info->block_group_kobjs[index]; |
9416 | block_group->space_info->block_group_kobjs[index] = NULL; | 9423 | block_group->space_info->block_group_kobjs[index] = NULL; |
@@ -9422,8 +9429,32 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
9422 | kobject_put(kobj); | 9429 | kobject_put(kobj); |
9423 | } | 9430 | } |
9424 | 9431 | ||
9432 | if (block_group->has_caching_ctl) | ||
9433 | caching_ctl = get_caching_control(block_group); | ||
9425 | if (block_group->cached == BTRFS_CACHE_STARTED) | 9434 | if (block_group->cached == BTRFS_CACHE_STARTED) |
9426 | wait_block_group_cache_done(block_group); | 9435 | wait_block_group_cache_done(block_group); |
9436 | if (block_group->has_caching_ctl) { | ||
9437 | down_write(&root->fs_info->commit_root_sem); | ||
9438 | if (!caching_ctl) { | ||
9439 | struct btrfs_caching_control *ctl; | ||
9440 | |||
9441 | list_for_each_entry(ctl, | ||
9442 | &root->fs_info->caching_block_groups, list) | ||
9443 | if (ctl->block_group == block_group) { | ||
9444 | caching_ctl = ctl; | ||
9445 | atomic_inc(&caching_ctl->count); | ||
9446 | break; | ||
9447 | } | ||
9448 | } | ||
9449 | if (caching_ctl) | ||
9450 | list_del_init(&caching_ctl->list); | ||
9451 | up_write(&root->fs_info->commit_root_sem); | ||
9452 | if (caching_ctl) { | ||
9453 | /* Once for the caching bgs list and once for us. */ | ||
9454 | put_caching_control(caching_ctl); | ||
9455 | put_caching_control(caching_ctl); | ||
9456 | } | ||
9457 | } | ||
9427 | 9458 | ||
9428 | btrfs_remove_free_space_cache(block_group); | 9459 | btrfs_remove_free_space_cache(block_group); |
9429 | 9460 | ||
@@ -9435,6 +9466,71 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
9435 | 9466 | ||
9436 | memcpy(&key, &block_group->key, sizeof(key)); | 9467 | memcpy(&key, &block_group->key, sizeof(key)); |
9437 | 9468 | ||
9469 | lock_chunks(root); | ||
9470 | if (!list_empty(&em->list)) { | ||
9471 | /* We're in the transaction->pending_chunks list. */ | ||
9472 | free_extent_map(em); | ||
9473 | } | ||
9474 | spin_lock(&block_group->lock); | ||
9475 | block_group->removed = 1; | ||
9476 | /* | ||
9477 | * At this point trimming can't start on this block group, because we | ||
9478 | * removed the block group from the tree fs_info->block_group_cache_tree | ||
9479 | * so no one can't find it anymore and even if someone already got this | ||
9480 | * block group before we removed it from the rbtree, they have already | ||
9481 | * incremented block_group->trimming - if they didn't, they won't find | ||
9482 | * any free space entries because we already removed them all when we | ||
9483 | * called btrfs_remove_free_space_cache(). | ||
9484 | * | ||
9485 | * And we must not remove the extent map from the fs_info->mapping_tree | ||
9486 | * to prevent the same logical address range and physical device space | ||
9487 | * ranges from being reused for a new block group. This is because our | ||
9488 | * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is | ||
9489 | * completely transactionless, so while it is trimming a range the | ||
9490 | * currently running transaction might finish and a new one start, | ||
9491 | * allowing for new block groups to be created that can reuse the same | ||
9492 | * physical device locations unless we take this special care. | ||
9493 | */ | ||
9494 | remove_em = (atomic_read(&block_group->trimming) == 0); | ||
9495 | /* | ||
9496 | * Make sure a trimmer task always sees the em in the pinned_chunks list | ||
9497 | * if it sees block_group->removed == 1 (needs to lock block_group->lock | ||
9498 | * before checking block_group->removed). | ||
9499 | */ | ||
9500 | if (!remove_em) { | ||
9501 | /* | ||
9502 | * Our em might be in trans->transaction->pending_chunks which | ||
9503 | * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks), | ||
9504 | * and so is the fs_info->pinned_chunks list. | ||
9505 | * | ||
9506 | * So at this point we must be holding the chunk_mutex to avoid | ||
9507 | * any races with chunk allocation (more specifically at | ||
9508 | * volumes.c:contains_pending_extent()), to ensure it always | ||
9509 | * sees the em, either in the pending_chunks list or in the | ||
9510 | * pinned_chunks list. | ||
9511 | */ | ||
9512 | list_move_tail(&em->list, &root->fs_info->pinned_chunks); | ||
9513 | } | ||
9514 | spin_unlock(&block_group->lock); | ||
9515 | |||
9516 | if (remove_em) { | ||
9517 | struct extent_map_tree *em_tree; | ||
9518 | |||
9519 | em_tree = &root->fs_info->mapping_tree.map_tree; | ||
9520 | write_lock(&em_tree->lock); | ||
9521 | /* | ||
9522 | * The em might be in the pending_chunks list, so make sure the | ||
9523 | * chunk mutex is locked, since remove_extent_mapping() will | ||
9524 | * delete us from that list. | ||
9525 | */ | ||
9526 | remove_extent_mapping(em_tree, em); | ||
9527 | write_unlock(&em_tree->lock); | ||
9528 | /* once for the tree */ | ||
9529 | free_extent_map(em); | ||
9530 | } | ||
9531 | |||
9532 | unlock_chunks(root); | ||
9533 | |||
9438 | btrfs_put_block_group(block_group); | 9534 | btrfs_put_block_group(block_group); |
9439 | btrfs_put_block_group(block_group); | 9535 | btrfs_put_block_group(block_group); |
9440 | 9536 | ||
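
The two comments above describe a handshake between block group removal and a concurrent fstrim: the remover sets block_group->removed under block_group->lock and samples the trimming counter, while a trimmer bumps the counter before it looks at the flag, so exactly one side ends up responsible for dropping the extent map. The following is a minimal userspace sketch of that protocol; the struct, fields and function names are invented for illustration, and a pthread mutex stands in for the kernel's spinlock and chunk mutex.

#include <pthread.h>
#include <stdbool.h>

struct bg_model {
	pthread_mutex_t lock;
	int trimming;		/* in-flight trim operations */
	bool removed;		/* block group is being deleted */
	bool em_pinned;		/* extent map parked on the pinned list */
	bool em_dropped;	/* extent map removed from the mapping tree */
};

static struct bg_model bg = { .lock = PTHREAD_MUTEX_INITIALIZER };

/* Trimmer: take a "trimming" reference before trusting anything else. */
static bool trim_start(struct bg_model *b)
{
	pthread_mutex_lock(&b->lock);
	if (b->removed) {
		pthread_mutex_unlock(&b->lock);
		return false;		/* group already gone, nothing to do */
	}
	b->trimming++;
	pthread_mutex_unlock(&b->lock);
	return true;
}

static void trim_end(struct bg_model *b)
{
	pthread_mutex_lock(&b->lock);
	if (--b->trimming == 0 && b->removed)
		b->em_dropped = true;	/* last trimmer does the cleanup */
	pthread_mutex_unlock(&b->lock);
}

/* Remover: decide who is responsible for dropping the extent map. */
static void remove_group(struct bg_model *b)
{
	pthread_mutex_lock(&b->lock);
	b->removed = true;
	if (b->trimming == 0)
		b->em_dropped = true;	/* no trimmer running, drop it now */
	else
		b->em_pinned = true;	/* keep it visible until trim_end() */
	pthread_mutex_unlock(&b->lock);
}

In the kernel, "dropping the extent map" corresponds to the remove_extent_mapping() plus free_extent_map() calls seen in btrfs_remove_block_group() above and in btrfs_trim_block_group() later in this patch.
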
@@ -9523,10 +9619,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) | |||
9523 | */ | 9619 | */ |
9524 | start = block_group->key.objectid; | 9620 | start = block_group->key.objectid; |
9525 | end = start + block_group->key.offset - 1; | 9621 | end = start + block_group->key.offset - 1; |
9526 | clear_extent_bits(&fs_info->freed_extents[0], start, end, | 9622 | ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, |
9527 | EXTENT_DIRTY, GFP_NOFS); | 9623 | EXTENT_DIRTY, GFP_NOFS); |
9528 | clear_extent_bits(&fs_info->freed_extents[1], start, end, | 9624 | if (ret) { |
9625 | btrfs_set_block_group_rw(root, block_group); | ||
9626 | goto end_trans; | ||
9627 | } | ||
9628 | ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, | ||
9529 | EXTENT_DIRTY, GFP_NOFS); | 9629 | EXTENT_DIRTY, GFP_NOFS); |
9630 | if (ret) { | ||
9631 | btrfs_set_block_group_rw(root, block_group); | ||
9632 | goto end_trans; | ||
9633 | } | ||
9530 | 9634 | ||
9531 | /* Reset pinned so btrfs_put_block_group doesn't complain */ | 9635 | /* Reset pinned so btrfs_put_block_group doesn't complain */ |
9532 | block_group->pinned = 0; | 9636 | block_group->pinned = 0; |
@@ -9537,6 +9641,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) | |||
9537 | */ | 9641 | */ |
9538 | ret = btrfs_remove_chunk(trans, root, | 9642 | ret = btrfs_remove_chunk(trans, root, |
9539 | block_group->key.objectid); | 9643 | block_group->key.objectid); |
9644 | end_trans: | ||
9540 | btrfs_end_transaction(trans, root); | 9645 | btrfs_end_transaction(trans, root); |
9541 | next: | 9646 | next: |
9542 | btrfs_put_block_group(block_group); | 9647 | btrfs_put_block_group(block_group); |
@@ -9657,12 +9762,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) | |||
9657 | } | 9762 | } |
9658 | 9763 | ||
9659 | /* | 9764 | /* |
9660 | * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(), | 9765 | * btrfs_{start,end}_write_no_snapshoting() are similar to |
9661 | * they are used to prevent the some tasks writing data into the page cache | 9766 | * mnt_{want,drop}_write(), they are used to prevent some tasks from writing |
9662 | * by nocow before the subvolume is snapshoted, but flush the data into | 9767 | * data into the page cache through nocow before the subvolume is snapshoted, |
9663 | * the disk after the snapshot creation. | 9768 | * but flush the data into disk after the snapshot creation, or to prevent |
9769 | * operations while snapshoting is ongoing that would cause the snapshot to be | ||
9770 | * inconsistent (writes followed by expanding truncates for example). | ||
9664 | */ | 9771 | */ |
9665 | void btrfs_end_nocow_write(struct btrfs_root *root) | 9772 | void btrfs_end_write_no_snapshoting(struct btrfs_root *root) |
9666 | { | 9773 | { |
9667 | percpu_counter_dec(&root->subv_writers->counter); | 9774 | percpu_counter_dec(&root->subv_writers->counter); |
9668 | /* | 9775 | /* |
@@ -9674,7 +9781,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) | |||
9674 | wake_up(&root->subv_writers->wait); | 9781 | wake_up(&root->subv_writers->wait); |
9675 | } | 9782 | } |
9676 | 9783 | ||
9677 | int btrfs_start_nocow_write(struct btrfs_root *root) | 9784 | int btrfs_start_write_no_snapshoting(struct btrfs_root *root) |
9678 | { | 9785 | { |
9679 | if (atomic_read(&root->will_be_snapshoted)) | 9786 | if (atomic_read(&root->will_be_snapshoted)) |
9680 | return 0; | 9787 | return 0; |
@@ -9685,7 +9792,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) | |||
9685 | */ | 9792 | */ |
9686 | smp_mb(); | 9793 | smp_mb(); |
9687 | if (atomic_read(&root->will_be_snapshoted)) { | 9794 | if (atomic_read(&root->will_be_snapshoted)) { |
9688 | btrfs_end_nocow_write(root); | 9795 | btrfs_end_write_no_snapshoting(root); |
9689 | return 0; | 9796 | return 0; |
9690 | } | 9797 | } |
9691 | return 1; | 9798 | return 1; |
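
btrfs_start_write_no_snapshoting()/btrfs_end_write_no_snapshoting() implement a writers-versus-snapshot gate: a writer bumps a counter and re-checks the will_be_snapshoted flag behind a full barrier, while the snapshot side raises the flag first and then waits for the counter to drain. Below is a rough userspace analogue with C11 atomics; it is illustrative only, with invented names, and the real code uses a percpu counter, smp_mb() and a waitqueue rather than plain atomics and a spin loop.

#include <stdatomic.h>
#include <stdbool.h>

struct root_model {
	atomic_int will_be_snapshotted;	/* pending snapshot requests */
	atomic_int writers;		/* writers currently inside the gate */
};

/* Writer side, mirroring btrfs_start_write_no_snapshoting(). */
static bool start_write_no_snapshot(struct root_model *r)
{
	if (atomic_load(&r->will_be_snapshotted))
		return false;
	atomic_fetch_add(&r->writers, 1);
	/* Publish the increment before re-checking the flag; pairs with the
	 * flag store + counter read on the snapshot side. */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&r->will_be_snapshotted)) {
		atomic_fetch_sub(&r->writers, 1);
		return false;		/* lost the race, caller must back off */
	}
	return true;
}

static void end_write_no_snapshot(struct root_model *r)
{
	atomic_fetch_sub(&r->writers, 1);
	/* The kernel additionally wakes a snapshot task waiting on this. */
}

/* Snapshot side: raise the flag, then wait until no writer is inside. */
static void begin_snapshot(struct root_model *r)
{
	atomic_fetch_add(&r->will_be_snapshotted, 1);
	atomic_thread_fence(memory_order_seq_cst);
	while (atomic_load(&r->writers) > 0)
		;	/* the kernel sleeps on root->subv_writers->wait */
}
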
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf3f424e0013..4ebabd237153 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -595,9 +595,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | |||
595 | clear = 1; | 595 | clear = 1; |
596 | again: | 596 | again: |
597 | if (!prealloc && (mask & __GFP_WAIT)) { | 597 | if (!prealloc && (mask & __GFP_WAIT)) { |
598 | /* | ||
599 | * Don't care for allocation failure here because we might end | ||
600 | * up not needing the pre-allocated extent state at all, which | ||
601 | * is the case if the tree only has extent states that | ||
602 | * cover our input range and don't cover any other range. | ||
603 | * If we end up needing a new extent state we allocate it later. | ||
604 | */ | ||
598 | prealloc = alloc_extent_state(mask); | 605 | prealloc = alloc_extent_state(mask); |
599 | if (!prealloc) | ||
600 | return -ENOMEM; | ||
601 | } | 606 | } |
602 | 607 | ||
603 | spin_lock(&tree->lock); | 608 | spin_lock(&tree->lock); |
@@ -796,17 +801,25 @@ static void set_state_bits(struct extent_io_tree *tree, | |||
796 | state->state |= bits_to_set; | 801 | state->state |= bits_to_set; |
797 | } | 802 | } |
798 | 803 | ||
799 | static void cache_state(struct extent_state *state, | 804 | static void cache_state_if_flags(struct extent_state *state, |
800 | struct extent_state **cached_ptr) | 805 | struct extent_state **cached_ptr, |
806 | const u64 flags) | ||
801 | { | 807 | { |
802 | if (cached_ptr && !(*cached_ptr)) { | 808 | if (cached_ptr && !(*cached_ptr)) { |
803 | if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) { | 809 | if (!flags || (state->state & flags)) { |
804 | *cached_ptr = state; | 810 | *cached_ptr = state; |
805 | atomic_inc(&state->refs); | 811 | atomic_inc(&state->refs); |
806 | } | 812 | } |
807 | } | 813 | } |
808 | } | 814 | } |
809 | 815 | ||
816 | static void cache_state(struct extent_state *state, | ||
817 | struct extent_state **cached_ptr) | ||
818 | { | ||
819 | return cache_state_if_flags(state, cached_ptr, | ||
820 | EXTENT_IOBITS | EXTENT_BOUNDARY); | ||
821 | } | ||
822 | |||
810 | /* | 823 | /* |
811 | * set some bits on a range in the tree. This may require allocations or | 824 | * set some bits on a range in the tree. This may require allocations or |
812 | * sleeping, so the gfp mask is used to indicate what is allowed. | 825 | * sleeping, so the gfp mask is used to indicate what is allowed. |
@@ -1058,13 +1071,21 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, | |||
1058 | int err = 0; | 1071 | int err = 0; |
1059 | u64 last_start; | 1072 | u64 last_start; |
1060 | u64 last_end; | 1073 | u64 last_end; |
1074 | bool first_iteration = true; | ||
1061 | 1075 | ||
1062 | btrfs_debug_check_extent_io_range(tree, start, end); | 1076 | btrfs_debug_check_extent_io_range(tree, start, end); |
1063 | 1077 | ||
1064 | again: | 1078 | again: |
1065 | if (!prealloc && (mask & __GFP_WAIT)) { | 1079 | if (!prealloc && (mask & __GFP_WAIT)) { |
1080 | /* | ||
1081 | * Best effort, don't worry if extent state allocation fails | ||
1082 | * here for the first iteration. We might have a cached state | ||
1083 | * that matches exactly the target range, in which case no | ||
1084 | * extent state allocations are needed. We'll only know this | ||
1085 | * after locking the tree. | ||
1086 | */ | ||
1066 | prealloc = alloc_extent_state(mask); | 1087 | prealloc = alloc_extent_state(mask); |
1067 | if (!prealloc) | 1088 | if (!prealloc && !first_iteration) |
1068 | return -ENOMEM; | 1089 | return -ENOMEM; |
1069 | } | 1090 | } |
1070 | 1091 | ||
@@ -1234,6 +1255,7 @@ search_again: | |||
1234 | spin_unlock(&tree->lock); | 1255 | spin_unlock(&tree->lock); |
1235 | if (mask & __GFP_WAIT) | 1256 | if (mask & __GFP_WAIT) |
1236 | cond_resched(); | 1257 | cond_resched(); |
1258 | first_iteration = false; | ||
1237 | goto again; | 1259 | goto again; |
1238 | } | 1260 | } |
1239 | 1261 | ||
@@ -1482,7 +1504,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, | |||
1482 | state = find_first_extent_bit_state(tree, start, bits); | 1504 | state = find_first_extent_bit_state(tree, start, bits); |
1483 | got_it: | 1505 | got_it: |
1484 | if (state) { | 1506 | if (state) { |
1485 | cache_state(state, cached_state); | 1507 | cache_state_if_flags(state, cached_state, 0); |
1486 | *start_ret = state->start; | 1508 | *start_ret = state->start; |
1487 | *end_ret = state->end; | 1509 | *end_ret = state->end; |
1488 | ret = 0; | 1510 | ret = 0; |
@@ -1746,6 +1768,9 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, | |||
1746 | if (page_ops == 0) | 1768 | if (page_ops == 0) |
1747 | return 0; | 1769 | return 0; |
1748 | 1770 | ||
1771 | if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) | ||
1772 | mapping_set_error(inode->i_mapping, -EIO); | ||
1773 | |||
1749 | while (nr_pages > 0) { | 1774 | while (nr_pages > 0) { |
1750 | ret = find_get_pages_contig(inode->i_mapping, index, | 1775 | ret = find_get_pages_contig(inode->i_mapping, index, |
1751 | min_t(unsigned long, | 1776 | min_t(unsigned long, |
@@ -1763,6 +1788,8 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, | |||
1763 | clear_page_dirty_for_io(pages[i]); | 1788 | clear_page_dirty_for_io(pages[i]); |
1764 | if (page_ops & PAGE_SET_WRITEBACK) | 1789 | if (page_ops & PAGE_SET_WRITEBACK) |
1765 | set_page_writeback(pages[i]); | 1790 | set_page_writeback(pages[i]); |
1791 | if (page_ops & PAGE_SET_ERROR) | ||
1792 | SetPageError(pages[i]); | ||
1766 | if (page_ops & PAGE_END_WRITEBACK) | 1793 | if (page_ops & PAGE_END_WRITEBACK) |
1767 | end_page_writeback(pages[i]); | 1794 | end_page_writeback(pages[i]); |
1768 | if (page_ops & PAGE_UNLOCK) | 1795 | if (page_ops & PAGE_UNLOCK) |
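
The comments added to clear_extent_bit() and convert_extent_bit() describe a best-effort preallocation: an allocation failure is tolerated on the first pass because, once the tree is locked, it may turn out that no new extent state is needed at all. A much-simplified userspace sketch of that idea, reduced to a single range and using invented names, is:

#include <stdlib.h>

struct range { long start, end, bits; };

/* Clear 'bits' in [r->start, end]. A split (and thus a new node) is only
 * needed when the clear stops short of the range's end. Returns -1 on
 * allocation failure, like -ENOMEM in the kernel. */
static int clear_bits_front(struct range *r, long end, long bits)
{
	struct range *prealloc = NULL;
	int first_iteration = 1;

again:
	if (!prealloc) {
		/* Best effort: failure is fine on the first pass, since a
		 * full overlap needs no new node at all. */
		prealloc = malloc(sizeof(*prealloc));
		if (!prealloc && !first_iteration)
			return -1;
	}

	if (end >= r->end) {		/* full overlap: no split required */
		r->bits &= ~bits;
		free(prealloc);
		return 0;
	}

	if (!prealloc) {		/* split needed after all: retry alloc */
		first_iteration = 0;
		goto again;
	}

	prealloc->start = end + 1;	/* tail keeps the old bits */
	prealloc->end = r->end;
	prealloc->bits = r->bits;
	r->end = end;
	r->bits &= ~bits;
	/* The kernel inserts 'prealloc' into the io tree here; the sketch
	 * simply discards it to stay short. */
	free(prealloc);
	return 0;
}
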
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6d4b938be986..ece9ce87edff 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
@@ -49,6 +49,7 @@ | |||
49 | #define PAGE_SET_WRITEBACK (1 << 2) | 49 | #define PAGE_SET_WRITEBACK (1 << 2) |
50 | #define PAGE_END_WRITEBACK (1 << 3) | 50 | #define PAGE_END_WRITEBACK (1 << 3) |
51 | #define PAGE_SET_PRIVATE2 (1 << 4) | 51 | #define PAGE_SET_PRIVATE2 (1 << 4) |
52 | #define PAGE_SET_ERROR (1 << 5) | ||
52 | 53 | ||
53 | /* | 54 | /* |
54 | * page->private values. Every page that is controlled by the extent | 55 | * page->private values. Every page that is controlled by the extent |
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 225302b39afb..6a98bddd8f33 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c | |||
@@ -287,8 +287,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, | |||
287 | if (!em) | 287 | if (!em) |
288 | goto out; | 288 | goto out; |
289 | 289 | ||
290 | if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) | ||
291 | list_move(&em->list, &tree->modified_extents); | ||
292 | em->generation = gen; | 290 | em->generation = gen; |
293 | clear_bit(EXTENT_FLAG_PINNED, &em->flags); | 291 | clear_bit(EXTENT_FLAG_PINNED, &em->flags); |
294 | em->mod_start = em->start; | 292 | em->mod_start = em->start; |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a18ceabd99a8..e4090259569b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, | |||
1428 | u64 num_bytes; | 1428 | u64 num_bytes; |
1429 | int ret; | 1429 | int ret; |
1430 | 1430 | ||
1431 | ret = btrfs_start_nocow_write(root); | 1431 | ret = btrfs_start_write_no_snapshoting(root); |
1432 | if (!ret) | 1432 | if (!ret) |
1433 | return -ENOSPC; | 1433 | return -ENOSPC; |
1434 | 1434 | ||
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, | |||
1451 | ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); | 1451 | ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL); |
1452 | if (ret <= 0) { | 1452 | if (ret <= 0) { |
1453 | ret = 0; | 1453 | ret = 0; |
1454 | btrfs_end_nocow_write(root); | 1454 | btrfs_end_write_no_snapshoting(root); |
1455 | } else { | 1455 | } else { |
1456 | *write_bytes = min_t(size_t, *write_bytes , | 1456 | *write_bytes = min_t(size_t, *write_bytes , |
1457 | num_bytes - pos + lockstart); | 1457 | num_bytes - pos + lockstart); |
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, | |||
1543 | btrfs_free_reserved_data_space(inode, | 1543 | btrfs_free_reserved_data_space(inode, |
1544 | reserve_bytes); | 1544 | reserve_bytes); |
1545 | else | 1545 | else |
1546 | btrfs_end_nocow_write(root); | 1546 | btrfs_end_write_no_snapshoting(root); |
1547 | break; | 1547 | break; |
1548 | } | 1548 | } |
1549 | 1549 | ||
@@ -1632,7 +1632,7 @@ again: | |||
1632 | 1632 | ||
1633 | release_bytes = 0; | 1633 | release_bytes = 0; |
1634 | if (only_release_metadata) | 1634 | if (only_release_metadata) |
1635 | btrfs_end_nocow_write(root); | 1635 | btrfs_end_write_no_snapshoting(root); |
1636 | 1636 | ||
1637 | if (only_release_metadata && copied > 0) { | 1637 | if (only_release_metadata && copied > 0) { |
1638 | u64 lockstart = round_down(pos, root->sectorsize); | 1638 | u64 lockstart = round_down(pos, root->sectorsize); |
@@ -1661,7 +1661,7 @@ again: | |||
1661 | 1661 | ||
1662 | if (release_bytes) { | 1662 | if (release_bytes) { |
1663 | if (only_release_metadata) { | 1663 | if (only_release_metadata) { |
1664 | btrfs_end_nocow_write(root); | 1664 | btrfs_end_write_no_snapshoting(root); |
1665 | btrfs_delalloc_release_metadata(inode, release_bytes); | 1665 | btrfs_delalloc_release_metadata(inode, release_bytes); |
1666 | } else { | 1666 | } else { |
1667 | btrfs_delalloc_release_space(inode, release_bytes); | 1667 | btrfs_delalloc_release_space(inode, release_bytes); |
@@ -1676,6 +1676,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, | |||
1676 | loff_t pos) | 1676 | loff_t pos) |
1677 | { | 1677 | { |
1678 | struct file *file = iocb->ki_filp; | 1678 | struct file *file = iocb->ki_filp; |
1679 | struct inode *inode = file_inode(file); | ||
1679 | ssize_t written; | 1680 | ssize_t written; |
1680 | ssize_t written_buffered; | 1681 | ssize_t written_buffered; |
1681 | loff_t endbyte; | 1682 | loff_t endbyte; |
@@ -1692,8 +1693,15 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, | |||
1692 | err = written_buffered; | 1693 | err = written_buffered; |
1693 | goto out; | 1694 | goto out; |
1694 | } | 1695 | } |
1696 | /* | ||
1697 | * Ensure all data is persisted. We want the next direct IO read to be | ||
1698 | * able to read what was just written. | ||
1699 | */ | ||
1695 | endbyte = pos + written_buffered - 1; | 1700 | endbyte = pos + written_buffered - 1; |
1696 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); | 1701 | err = btrfs_fdatawrite_range(inode, pos, endbyte); |
1702 | if (err) | ||
1703 | goto out; | ||
1704 | err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte); | ||
1697 | if (err) | 1705 | if (err) |
1698 | goto out; | 1706 | goto out; |
1699 | written += written_buffered; | 1707 | written += written_buffered; |
@@ -1854,10 +1862,7 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) | |||
1854 | int ret; | 1862 | int ret; |
1855 | 1863 | ||
1856 | atomic_inc(&BTRFS_I(inode)->sync_writers); | 1864 | atomic_inc(&BTRFS_I(inode)->sync_writers); |
1857 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); | 1865 | ret = btrfs_fdatawrite_range(inode, start, end); |
1858 | if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, | ||
1859 | &BTRFS_I(inode)->runtime_flags)) | ||
1860 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); | ||
1861 | atomic_dec(&BTRFS_I(inode)->sync_writers); | 1866 | atomic_dec(&BTRFS_I(inode)->sync_writers); |
1862 | 1867 | ||
1863 | return ret; | 1868 | return ret; |
@@ -2810,3 +2815,29 @@ int btrfs_auto_defrag_init(void) | |||
2810 | 2815 | ||
2811 | return 0; | 2816 | return 0; |
2812 | } | 2817 | } |
2818 | |||
2819 | int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) | ||
2820 | { | ||
2821 | int ret; | ||
2822 | |||
2823 | /* | ||
2824 | * So with compression we will find and lock a dirty page and clear the | ||
2825 | * first one as dirty, setup an async extent, and immediately return | ||
2826 | * with the entire range locked but with nobody actually marked with | ||
2827 | * writeback. So we can't just filemap_write_and_wait_range() and | ||
2828 | * expect it to work since it will just kick off a thread to do the | ||
2829 | * actual work. So we need to call filemap_fdatawrite_range _again_ | ||
2830 | * since it will wait on the page lock, which won't be unlocked until | ||
2831 | * after the pages have been marked as writeback and so we're good to go | ||
2832 | * from there. We have to do this otherwise we'll miss the ordered | ||
2833 | * extents and that results in badness. Please Josef, do not think you | ||
2834 | * know better and pull this out at some point in the future, it is | ||
2835 | * right and you are wrong. | ||
2836 | */ | ||
2837 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); | ||
2838 | if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, | ||
2839 | &BTRFS_I(inode)->runtime_flags)) | ||
2840 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); | ||
2841 | |||
2842 | return ret; | ||
2843 | } | ||
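
btrfs_fdatawrite_range() only starts write-out (twice when async compressed extents are pending); waiting for completion is still the caller's job, as in the updated __btrfs_direct_write() and btrfs_wait_ordered_range(). A minimal kernel-style sketch of that calling convention, assuming the usual btrfs headers, would be:

/* Illustrative only: start writeback with the new helper, then wait for
 * it with filemap_fdatawait_range(), mirroring the callers in this patch. */
static int flush_and_wait_range(struct inode *inode, loff_t start, loff_t end)
{
	int ret;

	ret = btrfs_fdatawrite_range(inode, start, end);
	if (ret)
		return ret;
	return filemap_fdatawait_range(inode->i_mapping, start, end);
}
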
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 33848196550e..030847bf7cec 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -27,10 +27,17 @@ | |||
27 | #include "disk-io.h" | 27 | #include "disk-io.h" |
28 | #include "extent_io.h" | 28 | #include "extent_io.h" |
29 | #include "inode-map.h" | 29 | #include "inode-map.h" |
30 | #include "volumes.h" | ||
30 | 31 | ||
31 | #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) | 32 | #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) |
32 | #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) | 33 | #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) |
33 | 34 | ||
35 | struct btrfs_trim_range { | ||
36 | u64 start; | ||
37 | u64 bytes; | ||
38 | struct list_head list; | ||
39 | }; | ||
40 | |||
34 | static int link_free_space(struct btrfs_free_space_ctl *ctl, | 41 | static int link_free_space(struct btrfs_free_space_ctl *ctl, |
35 | struct btrfs_free_space *info); | 42 | struct btrfs_free_space *info); |
36 | static void unlink_free_space(struct btrfs_free_space_ctl *ctl, | 43 | static void unlink_free_space(struct btrfs_free_space_ctl *ctl, |
@@ -881,6 +888,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, | |||
881 | int ret; | 888 | int ret; |
882 | struct btrfs_free_cluster *cluster = NULL; | 889 | struct btrfs_free_cluster *cluster = NULL; |
883 | struct rb_node *node = rb_first(&ctl->free_space_offset); | 890 | struct rb_node *node = rb_first(&ctl->free_space_offset); |
891 | struct btrfs_trim_range *trim_entry; | ||
884 | 892 | ||
885 | /* Get the cluster for this block_group if it exists */ | 893 | /* Get the cluster for this block_group if it exists */ |
886 | if (block_group && !list_empty(&block_group->cluster_list)) { | 894 | if (block_group && !list_empty(&block_group->cluster_list)) { |
@@ -916,6 +924,21 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, | |||
916 | cluster = NULL; | 924 | cluster = NULL; |
917 | } | 925 | } |
918 | } | 926 | } |
927 | |||
928 | /* | ||
929 | * Make sure we don't miss any range that was removed from our rbtree | ||
930 | * because trimming is running. Otherwise after a umount+mount (or crash | ||
931 | * after committing the transaction) we would leak free space and get | ||
932 | * an inconsistent free space cache report from fsck. | ||
933 | */ | ||
934 | list_for_each_entry(trim_entry, &ctl->trimming_ranges, list) { | ||
935 | ret = io_ctl_add_entry(io_ctl, trim_entry->start, | ||
936 | trim_entry->bytes, NULL); | ||
937 | if (ret) | ||
938 | goto fail; | ||
939 | *entries += 1; | ||
940 | } | ||
941 | |||
919 | return 0; | 942 | return 0; |
920 | fail: | 943 | fail: |
921 | return -ENOSPC; | 944 | return -ENOSPC; |
@@ -1135,12 +1158,15 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
1135 | 1158 | ||
1136 | io_ctl_set_generation(&io_ctl, trans->transid); | 1159 | io_ctl_set_generation(&io_ctl, trans->transid); |
1137 | 1160 | ||
1161 | mutex_lock(&ctl->cache_writeout_mutex); | ||
1138 | /* Write out the extent entries in the free space cache */ | 1162 | /* Write out the extent entries in the free space cache */ |
1139 | ret = write_cache_extent_entries(&io_ctl, ctl, | 1163 | ret = write_cache_extent_entries(&io_ctl, ctl, |
1140 | block_group, &entries, &bitmaps, | 1164 | block_group, &entries, &bitmaps, |
1141 | &bitmap_list); | 1165 | &bitmap_list); |
1142 | if (ret) | 1166 | if (ret) { |
1167 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
1143 | goto out_nospc; | 1168 | goto out_nospc; |
1169 | } | ||
1144 | 1170 | ||
1145 | /* | 1171 | /* |
1146 | * Some spaces that are freed in the current transaction are pinned, | 1172 | * Some spaces that are freed in the current transaction are pinned, |
@@ -1148,11 +1174,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, | |||
1148 | * committed, we shouldn't lose them. | 1174 | * committed, we shouldn't lose them. |
1149 | */ | 1175 | */ |
1150 | ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); | 1176 | ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); |
1151 | if (ret) | 1177 | if (ret) { |
1178 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
1152 | goto out_nospc; | 1179 | goto out_nospc; |
1180 | } | ||
1153 | 1181 | ||
1154 | /* At last, we write out all the bitmaps. */ | 1182 | /* |
1183 | * Finally, we write out all the bitmaps and keep cache_writeout_mutex | ||
1184 | * locked while doing it because a concurrent trim can be manipulating | ||
1185 | * or freeing the bitmap. | ||
1186 | */ | ||
1155 | ret = write_bitmap_entries(&io_ctl, &bitmap_list); | 1187 | ret = write_bitmap_entries(&io_ctl, &bitmap_list); |
1188 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
1156 | if (ret) | 1189 | if (ret) |
1157 | goto out_nospc; | 1190 | goto out_nospc; |
1158 | 1191 | ||
@@ -2295,6 +2328,8 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) | |||
2295 | ctl->start = block_group->key.objectid; | 2328 | ctl->start = block_group->key.objectid; |
2296 | ctl->private = block_group; | 2329 | ctl->private = block_group; |
2297 | ctl->op = &free_space_op; | 2330 | ctl->op = &free_space_op; |
2331 | INIT_LIST_HEAD(&ctl->trimming_ranges); | ||
2332 | mutex_init(&ctl->cache_writeout_mutex); | ||
2298 | 2333 | ||
2299 | /* | 2334 | /* |
2300 | * we only want to have 32k of ram per block group for keeping | 2335 | * we only want to have 32k of ram per block group for keeping |
@@ -2911,10 +2946,12 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) | |||
2911 | 2946 | ||
2912 | static int do_trimming(struct btrfs_block_group_cache *block_group, | 2947 | static int do_trimming(struct btrfs_block_group_cache *block_group, |
2913 | u64 *total_trimmed, u64 start, u64 bytes, | 2948 | u64 *total_trimmed, u64 start, u64 bytes, |
2914 | u64 reserved_start, u64 reserved_bytes) | 2949 | u64 reserved_start, u64 reserved_bytes, |
2950 | struct btrfs_trim_range *trim_entry) | ||
2915 | { | 2951 | { |
2916 | struct btrfs_space_info *space_info = block_group->space_info; | 2952 | struct btrfs_space_info *space_info = block_group->space_info; |
2917 | struct btrfs_fs_info *fs_info = block_group->fs_info; | 2953 | struct btrfs_fs_info *fs_info = block_group->fs_info; |
2954 | struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; | ||
2918 | int ret; | 2955 | int ret; |
2919 | int update = 0; | 2956 | int update = 0; |
2920 | u64 trimmed = 0; | 2957 | u64 trimmed = 0; |
@@ -2934,7 +2971,10 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, | |||
2934 | if (!ret) | 2971 | if (!ret) |
2935 | *total_trimmed += trimmed; | 2972 | *total_trimmed += trimmed; |
2936 | 2973 | ||
2974 | mutex_lock(&ctl->cache_writeout_mutex); | ||
2937 | btrfs_add_free_space(block_group, reserved_start, reserved_bytes); | 2975 | btrfs_add_free_space(block_group, reserved_start, reserved_bytes); |
2976 | list_del(&trim_entry->list); | ||
2977 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
2938 | 2978 | ||
2939 | if (update) { | 2979 | if (update) { |
2940 | spin_lock(&space_info->lock); | 2980 | spin_lock(&space_info->lock); |
@@ -2962,16 +3002,21 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2962 | u64 bytes; | 3002 | u64 bytes; |
2963 | 3003 | ||
2964 | while (start < end) { | 3004 | while (start < end) { |
3005 | struct btrfs_trim_range trim_entry; | ||
3006 | |||
3007 | mutex_lock(&ctl->cache_writeout_mutex); | ||
2965 | spin_lock(&ctl->tree_lock); | 3008 | spin_lock(&ctl->tree_lock); |
2966 | 3009 | ||
2967 | if (ctl->free_space < minlen) { | 3010 | if (ctl->free_space < minlen) { |
2968 | spin_unlock(&ctl->tree_lock); | 3011 | spin_unlock(&ctl->tree_lock); |
3012 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
2969 | break; | 3013 | break; |
2970 | } | 3014 | } |
2971 | 3015 | ||
2972 | entry = tree_search_offset(ctl, start, 0, 1); | 3016 | entry = tree_search_offset(ctl, start, 0, 1); |
2973 | if (!entry) { | 3017 | if (!entry) { |
2974 | spin_unlock(&ctl->tree_lock); | 3018 | spin_unlock(&ctl->tree_lock); |
3019 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
2975 | break; | 3020 | break; |
2976 | } | 3021 | } |
2977 | 3022 | ||
@@ -2980,6 +3025,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2980 | node = rb_next(&entry->offset_index); | 3025 | node = rb_next(&entry->offset_index); |
2981 | if (!node) { | 3026 | if (!node) { |
2982 | spin_unlock(&ctl->tree_lock); | 3027 | spin_unlock(&ctl->tree_lock); |
3028 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
2983 | goto out; | 3029 | goto out; |
2984 | } | 3030 | } |
2985 | entry = rb_entry(node, struct btrfs_free_space, | 3031 | entry = rb_entry(node, struct btrfs_free_space, |
@@ -2988,6 +3034,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2988 | 3034 | ||
2989 | if (entry->offset >= end) { | 3035 | if (entry->offset >= end) { |
2990 | spin_unlock(&ctl->tree_lock); | 3036 | spin_unlock(&ctl->tree_lock); |
3037 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
2991 | break; | 3038 | break; |
2992 | } | 3039 | } |
2993 | 3040 | ||
@@ -2997,6 +3044,7 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
2997 | bytes = min(extent_start + extent_bytes, end) - start; | 3044 | bytes = min(extent_start + extent_bytes, end) - start; |
2998 | if (bytes < minlen) { | 3045 | if (bytes < minlen) { |
2999 | spin_unlock(&ctl->tree_lock); | 3046 | spin_unlock(&ctl->tree_lock); |
3047 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
3000 | goto next; | 3048 | goto next; |
3001 | } | 3049 | } |
3002 | 3050 | ||
@@ -3004,9 +3052,13 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group, | |||
3004 | kmem_cache_free(btrfs_free_space_cachep, entry); | 3052 | kmem_cache_free(btrfs_free_space_cachep, entry); |
3005 | 3053 | ||
3006 | spin_unlock(&ctl->tree_lock); | 3054 | spin_unlock(&ctl->tree_lock); |
3055 | trim_entry.start = extent_start; | ||
3056 | trim_entry.bytes = extent_bytes; | ||
3057 | list_add_tail(&trim_entry.list, &ctl->trimming_ranges); | ||
3058 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
3007 | 3059 | ||
3008 | ret = do_trimming(block_group, total_trimmed, start, bytes, | 3060 | ret = do_trimming(block_group, total_trimmed, start, bytes, |
3009 | extent_start, extent_bytes); | 3061 | extent_start, extent_bytes, &trim_entry); |
3010 | if (ret) | 3062 | if (ret) |
3011 | break; | 3063 | break; |
3012 | next: | 3064 | next: |
@@ -3035,17 +3087,21 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, | |||
3035 | 3087 | ||
3036 | while (offset < end) { | 3088 | while (offset < end) { |
3037 | bool next_bitmap = false; | 3089 | bool next_bitmap = false; |
3090 | struct btrfs_trim_range trim_entry; | ||
3038 | 3091 | ||
3092 | mutex_lock(&ctl->cache_writeout_mutex); | ||
3039 | spin_lock(&ctl->tree_lock); | 3093 | spin_lock(&ctl->tree_lock); |
3040 | 3094 | ||
3041 | if (ctl->free_space < minlen) { | 3095 | if (ctl->free_space < minlen) { |
3042 | spin_unlock(&ctl->tree_lock); | 3096 | spin_unlock(&ctl->tree_lock); |
3097 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
3043 | break; | 3098 | break; |
3044 | } | 3099 | } |
3045 | 3100 | ||
3046 | entry = tree_search_offset(ctl, offset, 1, 0); | 3101 | entry = tree_search_offset(ctl, offset, 1, 0); |
3047 | if (!entry) { | 3102 | if (!entry) { |
3048 | spin_unlock(&ctl->tree_lock); | 3103 | spin_unlock(&ctl->tree_lock); |
3104 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
3049 | next_bitmap = true; | 3105 | next_bitmap = true; |
3050 | goto next; | 3106 | goto next; |
3051 | } | 3107 | } |
@@ -3054,6 +3110,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, | |||
3054 | ret2 = search_bitmap(ctl, entry, &start, &bytes); | 3110 | ret2 = search_bitmap(ctl, entry, &start, &bytes); |
3055 | if (ret2 || start >= end) { | 3111 | if (ret2 || start >= end) { |
3056 | spin_unlock(&ctl->tree_lock); | 3112 | spin_unlock(&ctl->tree_lock); |
3113 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
3057 | next_bitmap = true; | 3114 | next_bitmap = true; |
3058 | goto next; | 3115 | goto next; |
3059 | } | 3116 | } |
@@ -3061,6 +3118,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, | |||
3061 | bytes = min(bytes, end - start); | 3118 | bytes = min(bytes, end - start); |
3062 | if (bytes < minlen) { | 3119 | if (bytes < minlen) { |
3063 | spin_unlock(&ctl->tree_lock); | 3120 | spin_unlock(&ctl->tree_lock); |
3121 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
3064 | goto next; | 3122 | goto next; |
3065 | } | 3123 | } |
3066 | 3124 | ||
@@ -3069,9 +3127,13 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group, | |||
3069 | free_bitmap(ctl, entry); | 3127 | free_bitmap(ctl, entry); |
3070 | 3128 | ||
3071 | spin_unlock(&ctl->tree_lock); | 3129 | spin_unlock(&ctl->tree_lock); |
3130 | trim_entry.start = start; | ||
3131 | trim_entry.bytes = bytes; | ||
3132 | list_add_tail(&trim_entry.list, &ctl->trimming_ranges); | ||
3133 | mutex_unlock(&ctl->cache_writeout_mutex); | ||
3072 | 3134 | ||
3073 | ret = do_trimming(block_group, total_trimmed, start, bytes, | 3135 | ret = do_trimming(block_group, total_trimmed, start, bytes, |
3074 | start, bytes); | 3136 | start, bytes, &trim_entry); |
3075 | if (ret) | 3137 | if (ret) |
3076 | break; | 3138 | break; |
3077 | next: | 3139 | next: |
@@ -3101,11 +3163,52 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group, | |||
3101 | 3163 | ||
3102 | *trimmed = 0; | 3164 | *trimmed = 0; |
3103 | 3165 | ||
3166 | spin_lock(&block_group->lock); | ||
3167 | if (block_group->removed) { | ||
3168 | spin_unlock(&block_group->lock); | ||
3169 | return 0; | ||
3170 | } | ||
3171 | atomic_inc(&block_group->trimming); | ||
3172 | spin_unlock(&block_group->lock); | ||
3173 | |||
3104 | ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); | 3174 | ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); |
3105 | if (ret) | 3175 | if (ret) |
3106 | return ret; | 3176 | goto out; |
3107 | 3177 | ||
3108 | ret = trim_bitmaps(block_group, trimmed, start, end, minlen); | 3178 | ret = trim_bitmaps(block_group, trimmed, start, end, minlen); |
3179 | out: | ||
3180 | spin_lock(&block_group->lock); | ||
3181 | if (atomic_dec_and_test(&block_group->trimming) && | ||
3182 | block_group->removed) { | ||
3183 | struct extent_map_tree *em_tree; | ||
3184 | struct extent_map *em; | ||
3185 | |||
3186 | spin_unlock(&block_group->lock); | ||
3187 | |||
3188 | em_tree = &block_group->fs_info->mapping_tree.map_tree; | ||
3189 | write_lock(&em_tree->lock); | ||
3190 | em = lookup_extent_mapping(em_tree, block_group->key.objectid, | ||
3191 | 1); | ||
3192 | BUG_ON(!em); /* logic error, can't happen */ | ||
3193 | remove_extent_mapping(em_tree, em); | ||
3194 | write_unlock(&em_tree->lock); | ||
3195 | |||
3196 | lock_chunks(block_group->fs_info->chunk_root); | ||
3197 | list_del_init(&em->list); | ||
3198 | unlock_chunks(block_group->fs_info->chunk_root); | ||
3199 | |||
3200 | /* once for us and once for the tree */ | ||
3201 | free_extent_map(em); | ||
3202 | free_extent_map(em); | ||
3203 | |||
3204 | /* | ||
3205 | * We've left one free space entry and other tasks trimming | ||
3206 | * this block group have left 1 entry each one. Free them. | ||
3207 | */ | ||
3208 | __btrfs_remove_free_space_cache(block_group->free_space_ctl); | ||
3209 | } else { | ||
3210 | spin_unlock(&block_group->lock); | ||
3211 | } | ||
3109 | 3212 | ||
3110 | return ret; | 3213 | return ret; |
3111 | } | 3214 | } |
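
The trimming_ranges list added above lets the free space cache writer account for ranges that are temporarily absent from the rbtree while a discard is in flight. A userspace model of the same bookkeeping, using a pthread mutex in place of cache_writeout_mutex and invented names, looks like this:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct trim_range {
	uint64_t start;
	uint64_t bytes;
	struct trim_range *next;
};

static pthread_mutex_t writeout_mutex = PTHREAD_MUTEX_INITIALIZER;
static struct trim_range *trimming_ranges;

static void unlink_range(struct trim_range *entry)
{
	struct trim_range **pp = &trimming_ranges;

	while (*pp && *pp != entry)
		pp = &(*pp)->next;
	if (*pp)
		*pp = entry->next;
}

/* Trimmer: publish the range on the side list for the whole discard. */
static void trim_one(uint64_t start, uint64_t bytes)
{
	struct trim_range entry = { .start = start, .bytes = bytes };

	pthread_mutex_lock(&writeout_mutex);
	entry.next = trimming_ranges;		/* removed from the "tree", */
	trimming_ranges = &entry;		/* but still visible here    */
	pthread_mutex_unlock(&writeout_mutex);

	/* ... issue the discard for [start, start + bytes) ... */

	pthread_mutex_lock(&writeout_mutex);
	unlink_range(&entry);			/* re-added as free space */
	pthread_mutex_unlock(&writeout_mutex);
}

/* Cache writer: in-flight trims are still written out as free space. */
static void write_cache(void)
{
	struct trim_range *r;

	pthread_mutex_lock(&writeout_mutex);
	for (r = trimming_ranges; r; r = r->next)
		printf("free: %llu+%llu\n",
		       (unsigned long long)r->start,
		       (unsigned long long)r->bytes);
	pthread_mutex_unlock(&writeout_mutex);
}
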
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 0cf4977ef70d..88b2238a0aed 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h | |||
@@ -38,6 +38,8 @@ struct btrfs_free_space_ctl { | |||
38 | u64 start; | 38 | u64 start; |
39 | struct btrfs_free_space_op *op; | 39 | struct btrfs_free_space_op *op; |
40 | void *private; | 40 | void *private; |
41 | struct mutex cache_writeout_mutex; | ||
42 | struct list_head trimming_ranges; | ||
41 | }; | 43 | }; |
42 | 44 | ||
43 | struct btrfs_free_space_op { | 45 | struct btrfs_free_space_op { |
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 83d646bd2e4b..74faea3a516e 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c | |||
@@ -178,7 +178,7 @@ static void start_caching(struct btrfs_root *root) | |||
178 | root->root_key.objectid); | 178 | root->root_key.objectid); |
179 | if (IS_ERR(tsk)) { | 179 | if (IS_ERR(tsk)) { |
180 | btrfs_warn(root->fs_info, "failed to start inode caching task"); | 180 | btrfs_warn(root->fs_info, "failed to start inode caching task"); |
181 | btrfs_clear_and_info(root, CHANGE_INODE_CACHE, | 181 | btrfs_clear_pending_and_info(root->fs_info, INODE_MAP_CACHE, |
182 | "disabling inode map caching"); | 182 | "disabling inode map caching"); |
183 | } | 183 | } |
184 | } | 184 | } |
@@ -364,6 +364,8 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root) | |||
364 | ctl->start = 0; | 364 | ctl->start = 0; |
365 | ctl->private = NULL; | 365 | ctl->private = NULL; |
366 | ctl->op = &free_ino_op; | 366 | ctl->op = &free_ino_op; |
367 | INIT_LIST_HEAD(&ctl->trimming_ranges); | ||
368 | mutex_init(&ctl->cache_writeout_mutex); | ||
367 | 369 | ||
368 | /* | 370 | /* |
369 | * Initially we allow to use 16K of ram to cache chunks of | 371 | * Initially we allow to use 16K of ram to cache chunks of |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ff0dcc016b71..e687bb0dc73a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -382,7 +382,7 @@ static inline int inode_need_compress(struct inode *inode) | |||
382 | * are written in the same order that the flusher thread sent them | 382 | * are written in the same order that the flusher thread sent them |
383 | * down. | 383 | * down. |
384 | */ | 384 | */ |
385 | static noinline int compress_file_range(struct inode *inode, | 385 | static noinline void compress_file_range(struct inode *inode, |
386 | struct page *locked_page, | 386 | struct page *locked_page, |
387 | u64 start, u64 end, | 387 | u64 start, u64 end, |
388 | struct async_cow *async_cow, | 388 | struct async_cow *async_cow, |
@@ -411,14 +411,6 @@ static noinline int compress_file_range(struct inode *inode, | |||
411 | (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) | 411 | (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) |
412 | btrfs_add_inode_defrag(NULL, inode); | 412 | btrfs_add_inode_defrag(NULL, inode); |
413 | 413 | ||
414 | /* | ||
415 | * skip compression for a small file range(<=blocksize) that | ||
416 | * isn't an inline extent, since it dosen't save disk space at all. | ||
417 | */ | ||
418 | if ((end - start + 1) <= blocksize && | ||
419 | (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) | ||
420 | goto cleanup_and_bail_uncompressed; | ||
421 | |||
422 | actual_end = min_t(u64, isize, end + 1); | 414 | actual_end = min_t(u64, isize, end + 1); |
423 | again: | 415 | again: |
424 | will_compress = 0; | 416 | will_compress = 0; |
@@ -440,6 +432,14 @@ again: | |||
440 | 432 | ||
441 | total_compressed = actual_end - start; | 433 | total_compressed = actual_end - start; |
442 | 434 | ||
435 | /* | ||
436 | * skip compression for a small file range(<=blocksize) that | ||
437 | * isn't an inline extent, since it dosen't save disk space at all. | ||
438 | */ | ||
439 | if (total_compressed <= blocksize && | ||
440 | (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) | ||
441 | goto cleanup_and_bail_uncompressed; | ||
442 | |||
443 | /* we want to make sure that amount of ram required to uncompress | 443 | /* we want to make sure that amount of ram required to uncompress |
444 | * an extent is reasonable, so we limit the total size in ram | 444 | * an extent is reasonable, so we limit the total size in ram |
445 | * of a compressed extent to 128k. This is a crucial number | 445 | * of a compressed extent to 128k. This is a crucial number |
@@ -527,7 +527,10 @@ cont: | |||
527 | if (ret <= 0) { | 527 | if (ret <= 0) { |
528 | unsigned long clear_flags = EXTENT_DELALLOC | | 528 | unsigned long clear_flags = EXTENT_DELALLOC | |
529 | EXTENT_DEFRAG; | 529 | EXTENT_DEFRAG; |
530 | unsigned long page_error_op; | ||
531 | |||
530 | clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; | 532 | clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; |
533 | page_error_op = ret < 0 ? PAGE_SET_ERROR : 0; | ||
531 | 534 | ||
532 | /* | 535 | /* |
533 | * inline extent creation worked or returned error, | 536 | * inline extent creation worked or returned error, |
@@ -538,6 +541,7 @@ cont: | |||
538 | clear_flags, PAGE_UNLOCK | | 541 | clear_flags, PAGE_UNLOCK | |
539 | PAGE_CLEAR_DIRTY | | 542 | PAGE_CLEAR_DIRTY | |
540 | PAGE_SET_WRITEBACK | | 543 | PAGE_SET_WRITEBACK | |
544 | page_error_op | | ||
541 | PAGE_END_WRITEBACK); | 545 | PAGE_END_WRITEBACK); |
542 | goto free_pages_out; | 546 | goto free_pages_out; |
543 | } | 547 | } |
@@ -620,8 +624,7 @@ cleanup_and_bail_uncompressed: | |||
620 | *num_added += 1; | 624 | *num_added += 1; |
621 | } | 625 | } |
622 | 626 | ||
623 | out: | 627 | return; |
624 | return ret; | ||
625 | 628 | ||
626 | free_pages_out: | 629 | free_pages_out: |
627 | for (i = 0; i < nr_pages_ret; i++) { | 630 | for (i = 0; i < nr_pages_ret; i++) { |
@@ -629,8 +632,22 @@ free_pages_out: | |||
629 | page_cache_release(pages[i]); | 632 | page_cache_release(pages[i]); |
630 | } | 633 | } |
631 | kfree(pages); | 634 | kfree(pages); |
635 | } | ||
632 | 636 | ||
633 | goto out; | 637 | static void free_async_extent_pages(struct async_extent *async_extent) |
638 | { | ||
639 | int i; | ||
640 | |||
641 | if (!async_extent->pages) | ||
642 | return; | ||
643 | |||
644 | for (i = 0; i < async_extent->nr_pages; i++) { | ||
645 | WARN_ON(async_extent->pages[i]->mapping); | ||
646 | page_cache_release(async_extent->pages[i]); | ||
647 | } | ||
648 | kfree(async_extent->pages); | ||
649 | async_extent->nr_pages = 0; | ||
650 | async_extent->pages = NULL; | ||
634 | } | 651 | } |
635 | 652 | ||
636 | /* | 653 | /* |
@@ -639,7 +656,7 @@ free_pages_out: | |||
639 | * queued. We walk all the async extents created by compress_file_range | 656 | * queued. We walk all the async extents created by compress_file_range |
640 | * and send them down to the disk. | 657 | * and send them down to the disk. |
641 | */ | 658 | */ |
642 | static noinline int submit_compressed_extents(struct inode *inode, | 659 | static noinline void submit_compressed_extents(struct inode *inode, |
643 | struct async_cow *async_cow) | 660 | struct async_cow *async_cow) |
644 | { | 661 | { |
645 | struct async_extent *async_extent; | 662 | struct async_extent *async_extent; |
@@ -651,9 +668,6 @@ static noinline int submit_compressed_extents(struct inode *inode, | |||
651 | struct extent_io_tree *io_tree; | 668 | struct extent_io_tree *io_tree; |
652 | int ret = 0; | 669 | int ret = 0; |
653 | 670 | ||
654 | if (list_empty(&async_cow->extents)) | ||
655 | return 0; | ||
656 | |||
657 | again: | 671 | again: |
658 | while (!list_empty(&async_cow->extents)) { | 672 | while (!list_empty(&async_cow->extents)) { |
659 | async_extent = list_entry(async_cow->extents.next, | 673 | async_extent = list_entry(async_cow->extents.next, |
@@ -709,15 +723,7 @@ retry: | |||
709 | async_extent->compressed_size, | 723 | async_extent->compressed_size, |
710 | 0, alloc_hint, &ins, 1, 1); | 724 | 0, alloc_hint, &ins, 1, 1); |
711 | if (ret) { | 725 | if (ret) { |
712 | int i; | 726 | free_async_extent_pages(async_extent); |
713 | |||
714 | for (i = 0; i < async_extent->nr_pages; i++) { | ||
715 | WARN_ON(async_extent->pages[i]->mapping); | ||
716 | page_cache_release(async_extent->pages[i]); | ||
717 | } | ||
718 | kfree(async_extent->pages); | ||
719 | async_extent->nr_pages = 0; | ||
720 | async_extent->pages = NULL; | ||
721 | 727 | ||
722 | if (ret == -ENOSPC) { | 728 | if (ret == -ENOSPC) { |
723 | unlock_extent(io_tree, async_extent->start, | 729 | unlock_extent(io_tree, async_extent->start, |
@@ -814,15 +820,26 @@ retry: | |||
814 | ins.objectid, | 820 | ins.objectid, |
815 | ins.offset, async_extent->pages, | 821 | ins.offset, async_extent->pages, |
816 | async_extent->nr_pages); | 822 | async_extent->nr_pages); |
823 | if (ret) { | ||
824 | struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; | ||
825 | struct page *p = async_extent->pages[0]; | ||
826 | const u64 start = async_extent->start; | ||
827 | const u64 end = start + async_extent->ram_size - 1; | ||
828 | |||
829 | p->mapping = inode->i_mapping; | ||
830 | tree->ops->writepage_end_io_hook(p, start, end, | ||
831 | NULL, 0); | ||
832 | p->mapping = NULL; | ||
833 | extent_clear_unlock_delalloc(inode, start, end, NULL, 0, | ||
834 | PAGE_END_WRITEBACK | | ||
835 | PAGE_SET_ERROR); | ||
836 | free_async_extent_pages(async_extent); | ||
837 | } | ||
817 | alloc_hint = ins.objectid + ins.offset; | 838 | alloc_hint = ins.objectid + ins.offset; |
818 | kfree(async_extent); | 839 | kfree(async_extent); |
819 | if (ret) | ||
820 | goto out; | ||
821 | cond_resched(); | 840 | cond_resched(); |
822 | } | 841 | } |
823 | ret = 0; | 842 | return; |
824 | out: | ||
825 | return ret; | ||
826 | out_free_reserve: | 843 | out_free_reserve: |
827 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); | 844 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); |
828 | out_free: | 845 | out_free: |
@@ -832,7 +849,9 @@ out_free: | |||
832 | NULL, EXTENT_LOCKED | EXTENT_DELALLOC | | 849 | NULL, EXTENT_LOCKED | EXTENT_DELALLOC | |
833 | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, | 850 | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, |
834 | PAGE_UNLOCK | PAGE_CLEAR_DIRTY | | 851 | PAGE_UNLOCK | PAGE_CLEAR_DIRTY | |
835 | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); | 852 | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | |
853 | PAGE_SET_ERROR); | ||
854 | free_async_extent_pages(async_extent); | ||
836 | kfree(async_extent); | 855 | kfree(async_extent); |
837 | goto again; | 856 | goto again; |
838 | } | 857 | } |
@@ -1318,7 +1337,7 @@ next_slot: | |||
1318 | * we fall into common COW way. | 1337 | * we fall into common COW way. |
1319 | */ | 1338 | */ |
1320 | if (!nolock) { | 1339 | if (!nolock) { |
1321 | err = btrfs_start_nocow_write(root); | 1340 | err = btrfs_start_write_no_snapshoting(root); |
1322 | if (!err) | 1341 | if (!err) |
1323 | goto out_check; | 1342 | goto out_check; |
1324 | } | 1343 | } |
@@ -1342,7 +1361,7 @@ out_check: | |||
1342 | if (extent_end <= start) { | 1361 | if (extent_end <= start) { |
1343 | path->slots[0]++; | 1362 | path->slots[0]++; |
1344 | if (!nolock && nocow) | 1363 | if (!nolock && nocow) |
1345 | btrfs_end_nocow_write(root); | 1364 | btrfs_end_write_no_snapshoting(root); |
1346 | goto next_slot; | 1365 | goto next_slot; |
1347 | } | 1366 | } |
1348 | if (!nocow) { | 1367 | if (!nocow) { |
@@ -1362,7 +1381,7 @@ out_check: | |||
1362 | page_started, nr_written, 1); | 1381 | page_started, nr_written, 1); |
1363 | if (ret) { | 1382 | if (ret) { |
1364 | if (!nolock && nocow) | 1383 | if (!nolock && nocow) |
1365 | btrfs_end_nocow_write(root); | 1384 | btrfs_end_write_no_snapshoting(root); |
1366 | goto error; | 1385 | goto error; |
1367 | } | 1386 | } |
1368 | cow_start = (u64)-1; | 1387 | cow_start = (u64)-1; |
@@ -1413,7 +1432,7 @@ out_check: | |||
1413 | num_bytes); | 1432 | num_bytes); |
1414 | if (ret) { | 1433 | if (ret) { |
1415 | if (!nolock && nocow) | 1434 | if (!nolock && nocow) |
1416 | btrfs_end_nocow_write(root); | 1435 | btrfs_end_write_no_snapshoting(root); |
1417 | goto error; | 1436 | goto error; |
1418 | } | 1437 | } |
1419 | } | 1438 | } |
@@ -1424,7 +1443,7 @@ out_check: | |||
1424 | EXTENT_DELALLOC, PAGE_UNLOCK | | 1443 | EXTENT_DELALLOC, PAGE_UNLOCK | |
1425 | PAGE_SET_PRIVATE2); | 1444 | PAGE_SET_PRIVATE2); |
1426 | if (!nolock && nocow) | 1445 | if (!nolock && nocow) |
1427 | btrfs_end_nocow_write(root); | 1446 | btrfs_end_write_no_snapshoting(root); |
1428 | cur_offset = extent_end; | 1447 | cur_offset = extent_end; |
1429 | if (cur_offset > end) | 1448 | if (cur_offset > end) |
1430 | break; | 1449 | break; |
@@ -4580,6 +4599,26 @@ next: | |||
4580 | return err; | 4599 | return err; |
4581 | } | 4600 | } |
4582 | 4601 | ||
4602 | static int wait_snapshoting_atomic_t(atomic_t *a) | ||
4603 | { | ||
4604 | schedule(); | ||
4605 | return 0; | ||
4606 | } | ||
4607 | |||
4608 | static void wait_for_snapshot_creation(struct btrfs_root *root) | ||
4609 | { | ||
4610 | while (true) { | ||
4611 | int ret; | ||
4612 | |||
4613 | ret = btrfs_start_write_no_snapshoting(root); | ||
4614 | if (ret) | ||
4615 | break; | ||
4616 | wait_on_atomic_t(&root->will_be_snapshoted, | ||
4617 | wait_snapshoting_atomic_t, | ||
4618 | TASK_UNINTERRUPTIBLE); | ||
4619 | } | ||
4620 | } | ||
4621 | |||
4583 | static int btrfs_setsize(struct inode *inode, struct iattr *attr) | 4622 | static int btrfs_setsize(struct inode *inode, struct iattr *attr) |
4584 | { | 4623 | { |
4585 | struct btrfs_root *root = BTRFS_I(inode)->root; | 4624 | struct btrfs_root *root = BTRFS_I(inode)->root; |
@@ -4604,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) | |||
4604 | 4643 | ||
4605 | if (newsize > oldsize) { | 4644 | if (newsize > oldsize) { |
4606 | truncate_pagecache(inode, newsize); | 4645 | truncate_pagecache(inode, newsize); |
4646 | /* | ||
4647 | * Don't do an expanding truncate while snapshoting is ongoing. | ||
4648 | * This is to ensure the snapshot captures a fully consistent | ||
4649 | * state of this file - if the snapshot captures this expanding | ||
4650 | * truncation, it must capture all writes that happened before | ||
4651 | * this truncation. | ||
4652 | */ | ||
4653 | wait_for_snapshot_creation(root); | ||
4607 | ret = btrfs_cont_expand(inode, oldsize, newsize); | 4654 | ret = btrfs_cont_expand(inode, oldsize, newsize); |
4608 | if (ret) | 4655 | if (ret) { |
4656 | btrfs_end_write_no_snapshoting(root); | ||
4609 | return ret; | 4657 | return ret; |
4658 | } | ||
4610 | 4659 | ||
4611 | trans = btrfs_start_transaction(root, 1); | 4660 | trans = btrfs_start_transaction(root, 1); |
4612 | if (IS_ERR(trans)) | 4661 | if (IS_ERR(trans)) { |
4662 | btrfs_end_write_no_snapshoting(root); | ||
4613 | return PTR_ERR(trans); | 4663 | return PTR_ERR(trans); |
4664 | } | ||
4614 | 4665 | ||
4615 | i_size_write(inode, newsize); | 4666 | i_size_write(inode, newsize); |
4616 | btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); | 4667 | btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); |
4617 | ret = btrfs_update_inode(trans, root, inode); | 4668 | ret = btrfs_update_inode(trans, root, inode); |
4669 | btrfs_end_write_no_snapshoting(root); | ||
4618 | btrfs_end_transaction(trans, root); | 4670 | btrfs_end_transaction(trans, root); |
4619 | } else { | 4671 | } else { |
4620 | 4672 | ||
@@ -7000,9 +7052,12 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, | |||
7000 | btrfs_put_ordered_extent(ordered); | 7052 | btrfs_put_ordered_extent(ordered); |
7001 | } else { | 7053 | } else { |
7002 | /* Screw you mmap */ | 7054 | /* Screw you mmap */ |
7003 | ret = filemap_write_and_wait_range(inode->i_mapping, | 7055 | ret = btrfs_fdatawrite_range(inode, lockstart, lockend); |
7004 | lockstart, | 7056 | if (ret) |
7005 | lockend); | 7057 | break; |
7058 | ret = filemap_fdatawait_range(inode->i_mapping, | ||
7059 | lockstart, | ||
7060 | lockend); | ||
7006 | if (ret) | 7061 | if (ret) |
7007 | break; | 7062 | break; |
7008 | 7063 | ||
@@ -9442,6 +9497,21 @@ out_inode: | |||
9442 | 9497 | ||
9443 | } | 9498 | } |
9444 | 9499 | ||
9500 | /* Inspired by filemap_check_errors() */ | ||
9501 | int btrfs_inode_check_errors(struct inode *inode) | ||
9502 | { | ||
9503 | int ret = 0; | ||
9504 | |||
9505 | if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) && | ||
9506 | test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags)) | ||
9507 | ret = -ENOSPC; | ||
9508 | if (test_bit(AS_EIO, &inode->i_mapping->flags) && | ||
9509 | test_and_clear_bit(AS_EIO, &inode->i_mapping->flags)) | ||
9510 | ret = -EIO; | ||
9511 | |||
9512 | return ret; | ||
9513 | } | ||
9514 | |||
9445 | static const struct inode_operations btrfs_dir_inode_operations = { | 9515 | static const struct inode_operations btrfs_dir_inode_operations = { |
9446 | .getattr = btrfs_getattr, | 9516 | .getattr = btrfs_getattr, |
9447 | .lookup = btrfs_lookup, | 9517 | .lookup = btrfs_lookup, |
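
The new PAGE_SET_ERROR handling exists so that a failed async compressed write is not silently dropped: the pages are flagged and the mapping records an error that a later fsync/msync will return. A kernel-style sketch of the mechanism, illustrative only and assuming <linux/pagemap.h>:

/* Illustrative only: make an async write failure visible to a later fsync.
 * mapping_set_error() sets AS_EIO on the address space, which
 * filemap_fdatawait_range() or btrfs_inode_check_errors() report as -EIO. */
static void mark_async_write_failed(struct inode *inode, struct page *page)
{
	SetPageError(page);
	mapping_set_error(inode->i_mapping, -EIO);
}
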
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 080fe66c0349..d49fe8a0f6b5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -617,7 +617,7 @@ fail: | |||
617 | return ret; | 617 | return ret; |
618 | } | 618 | } |
619 | 619 | ||
620 | static void btrfs_wait_nocow_write(struct btrfs_root *root) | 620 | static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) |
621 | { | 621 | { |
622 | s64 writers; | 622 | s64 writers; |
623 | DEFINE_WAIT(wait); | 623 | DEFINE_WAIT(wait); |
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, | |||
649 | 649 | ||
650 | atomic_inc(&root->will_be_snapshoted); | 650 | atomic_inc(&root->will_be_snapshoted); |
651 | smp_mb__after_atomic(); | 651 | smp_mb__after_atomic(); |
652 | btrfs_wait_nocow_write(root); | 652 | btrfs_wait_for_no_snapshoting_writes(root); |
653 | 653 | ||
654 | ret = btrfs_start_delalloc_inodes(root, 0); | 654 | ret = btrfs_start_delalloc_inodes(root, 0); |
655 | if (ret) | 655 | if (ret) |
@@ -717,35 +717,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, | |||
717 | if (ret) | 717 | if (ret) |
718 | goto fail; | 718 | goto fail; |
719 | 719 | ||
720 | /* | ||
721 | * If orphan cleanup did remove any orphans, it means the tree was | ||
722 | * modified and therefore the commit root is not the same as the | ||
723 | * current root anymore. This is a problem, because send uses the | ||
724 | * commit root and therefore can see inode items that don't exist | ||
725 | * in the current root anymore, and for example make calls to | ||
726 | * btrfs_iget, which will do tree lookups based on the current root | ||
727 | * and not on the commit root. Those lookups will fail, returning a | ||
728 | * -ESTALE error, and making send fail with that error. So make sure | ||
729 | * a send does not see any orphans we have just removed, and that it | ||
730 | * will see the same inodes regardless of whether a transaction | ||
731 | * commit happened before it started (meaning that the commit root | ||
732 | * will be the same as the current root) or not. | ||
733 | */ | ||
734 | if (readonly && pending_snapshot->snap->node != | ||
735 | pending_snapshot->snap->commit_root) { | ||
736 | trans = btrfs_join_transaction(pending_snapshot->snap); | ||
737 | if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { | ||
738 | ret = PTR_ERR(trans); | ||
739 | goto fail; | ||
740 | } | ||
741 | if (!IS_ERR(trans)) { | ||
742 | ret = btrfs_commit_transaction(trans, | ||
743 | pending_snapshot->snap); | ||
744 | if (ret) | ||
745 | goto fail; | ||
746 | } | ||
747 | } | ||
748 | |||
749 | inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); | 720 | inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); |
750 | if (IS_ERR(inode)) { | 721 | if (IS_ERR(inode)) { |
751 | ret = PTR_ERR(inode); | 722 | ret = PTR_ERR(inode); |
@@ -761,7 +732,8 @@ fail: | |||
761 | free: | 732 | free: |
762 | kfree(pending_snapshot); | 733 | kfree(pending_snapshot); |
763 | out: | 734 | out: |
764 | atomic_dec(&root->will_be_snapshoted); | 735 | if (atomic_dec_and_test(&root->will_be_snapshoted)) |
736 | wake_up_atomic_t(&root->will_be_snapshoted); | ||
765 | return ret; | 737 | return ret; |
766 | } | 738 | } |
767 | 739 | ||
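
The change above pairs with wait_for_snapshot_creation() added to inode.c: the snapshot ioctl holds a count in root->will_be_snapshoted and only the final atomic_dec_and_test() issues the wake-up. A condensed sketch of both halves of that pairing, illustrative rather than the literal patch:

static int snapshot_wait_action(atomic_t *a)
{
	schedule();	/* sleep until wake_up_atomic_t(), then recheck */
	return 0;
}

/* Waiter (expanding truncate path in inode.c). */
static void wait_until_no_pending_snapshot(struct btrfs_root *root)
{
	wait_on_atomic_t(&root->will_be_snapshoted, snapshot_wait_action,
			 TASK_UNINTERRUPTIBLE);
}

/* Waker (snapshot ioctl exit path above). */
static void drop_snapshot_ref(struct btrfs_root *root)
{
	if (atomic_dec_and_test(&root->will_be_snapshoted))
		wake_up_atomic_t(&root->will_be_snapshoted);
}
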
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ac734ec4cc20..534544e08f76 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
220 | INIT_LIST_HEAD(&entry->work_list); | 220 | INIT_LIST_HEAD(&entry->work_list); |
221 | init_completion(&entry->completion); | 221 | init_completion(&entry->completion); |
222 | INIT_LIST_HEAD(&entry->log_list); | 222 | INIT_LIST_HEAD(&entry->log_list); |
223 | INIT_LIST_HEAD(&entry->trans_list); | ||
223 | 224 | ||
224 | trace_btrfs_ordered_extent_add(inode, entry); | 225 | trace_btrfs_ordered_extent_add(inode, entry); |
225 | 226 | ||
@@ -431,19 +432,31 @@ out: | |||
431 | 432 | ||
432 | /* Needs to either be called under a log transaction or the log_mutex */ | 433 | /* Needs to either be called under a log transaction or the log_mutex */ |
433 | void btrfs_get_logged_extents(struct inode *inode, | 434 | void btrfs_get_logged_extents(struct inode *inode, |
434 | struct list_head *logged_list) | 435 | struct list_head *logged_list, |
436 | const loff_t start, | ||
437 | const loff_t end) | ||
435 | { | 438 | { |
436 | struct btrfs_ordered_inode_tree *tree; | 439 | struct btrfs_ordered_inode_tree *tree; |
437 | struct btrfs_ordered_extent *ordered; | 440 | struct btrfs_ordered_extent *ordered; |
438 | struct rb_node *n; | 441 | struct rb_node *n; |
442 | struct rb_node *prev; | ||
439 | 443 | ||
440 | tree = &BTRFS_I(inode)->ordered_tree; | 444 | tree = &BTRFS_I(inode)->ordered_tree; |
441 | spin_lock_irq(&tree->lock); | 445 | spin_lock_irq(&tree->lock); |
442 | for (n = rb_first(&tree->tree); n; n = rb_next(n)) { | 446 | n = __tree_search(&tree->tree, end, &prev); |
447 | if (!n) | ||
448 | n = prev; | ||
449 | for (; n; n = rb_prev(n)) { | ||
443 | ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); | 450 | ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); |
451 | if (ordered->file_offset > end) | ||
452 | continue; | ||
453 | if (entry_end(ordered) <= start) | ||
454 | break; | ||
444 | if (!list_empty(&ordered->log_list)) | 455 | if (!list_empty(&ordered->log_list)) |
445 | continue; | 456 | continue; |
446 | list_add_tail(&ordered->log_list, logged_list); | 457 | if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) |
458 | continue; | ||
459 | list_add(&ordered->log_list, logged_list); | ||
447 | atomic_inc(&ordered->refs); | 460 | atomic_inc(&ordered->refs); |
448 | } | 461 | } |
449 | spin_unlock_irq(&tree->lock); | 462 | spin_unlock_irq(&tree->lock); |
@@ -472,7 +485,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list, | |||
472 | spin_unlock_irq(&log->log_extents_lock[index]); | 485 | spin_unlock_irq(&log->log_extents_lock[index]); |
473 | } | 486 | } |
474 | 487 | ||
475 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) | 488 | void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, |
489 | struct btrfs_root *log, u64 transid) | ||
476 | { | 490 | { |
477 | struct btrfs_ordered_extent *ordered; | 491 | struct btrfs_ordered_extent *ordered; |
478 | int index = transid % 2; | 492 | int index = transid % 2; |
@@ -497,7 +511,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) | |||
497 | wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, | 511 | wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, |
498 | &ordered->flags)); | 512 | &ordered->flags)); |
499 | 513 | ||
500 | btrfs_put_ordered_extent(ordered); | 514 | if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) |
515 | list_add_tail(&ordered->trans_list, &trans->ordered); | ||
501 | spin_lock_irq(&log->log_extents_lock[index]); | 516 | spin_lock_irq(&log->log_extents_lock[index]); |
502 | } | 517 | } |
503 | spin_unlock_irq(&log->log_extents_lock[index]); | 518 | spin_unlock_irq(&log->log_extents_lock[index]); |
@@ -725,30 +740,10 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) | |||
725 | /* start IO across the range first to instantiate any delalloc | 740 | /* start IO across the range first to instantiate any delalloc |
726 | * extents | 741 | * extents |
727 | */ | 742 | */ |
728 | ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end); | 743 | ret = btrfs_fdatawrite_range(inode, start, orig_end); |
729 | if (ret) | 744 | if (ret) |
730 | return ret; | 745 | return ret; |
731 | /* | 746 | |
732 | * So with compression we will find and lock a dirty page and clear the | ||
733 | * first one as dirty, setup an async extent, and immediately return | ||
734 | * with the entire range locked but with nobody actually marked with | ||
735 | * writeback. So we can't just filemap_write_and_wait_range() and | ||
736 | * expect it to work since it will just kick off a thread to do the | ||
737 | * actual work. So we need to call filemap_fdatawrite_range _again_ | ||
738 | * since it will wait on the page lock, which won't be unlocked until | ||
739 | * after the pages have been marked as writeback and so we're good to go | ||
740 | * from there. We have to do this otherwise we'll miss the ordered | ||
741 | * extents and that results in badness. Please Josef, do not think you | ||
742 | * know better and pull this out at some point in the future, it is | ||
743 | * right and you are wrong. | ||
744 | */ | ||
745 | if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, | ||
746 | &BTRFS_I(inode)->runtime_flags)) { | ||
747 | ret = filemap_fdatawrite_range(inode->i_mapping, start, | ||
748 | orig_end); | ||
749 | if (ret) | ||
750 | return ret; | ||
751 | } | ||
752 | ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); | 747 | ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); |
753 | if (ret) | 748 | if (ret) |
754 | return ret; | 749 | return ret; |
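The comment deleted above explains why inodes with async (compressed) extents need a second filemap_fdatawrite_range() call. A plausible sketch of the btrfs_fdatawrite_range() helper that now encapsulates that logic is shown below; its body is an assumption, only the call site is part of this hunk:

int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
	int ret;

	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (ret)
		return ret;

	/*
	 * Compression can return with the range locked but nothing marked
	 * for writeback yet; a second call waits on the page locks and so
	 * catches the ordered extents (see the removed comment above).
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);

	return ret;
}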
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d81a274d621e..e96cd4ccd805 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum { | |||
71 | ordered extent */ | 71 | ordered extent */ |
72 | #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ | 72 | #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */ |
73 | 73 | ||
74 | #define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent | ||
75 | * in the logging code. */ | ||
74 | struct btrfs_ordered_extent { | 76 | struct btrfs_ordered_extent { |
75 | /* logical offset in the file */ | 77 | /* logical offset in the file */ |
76 | u64 file_offset; | 78 | u64 file_offset; |
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent { | |||
121 | /* If we need to wait on this to be done */ | 123 | /* If we need to wait on this to be done */ |
122 | struct list_head log_list; | 124 | struct list_head log_list; |
123 | 125 | ||
126 | /* If the transaction needs to wait on this ordered extent */ | ||
127 | struct list_head trans_list; | ||
128 | |||
124 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ | 129 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ |
125 | wait_queue_head_t wait; | 130 | wait_queue_head_t wait; |
126 | 131 | ||
@@ -193,11 +198,14 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, | |||
193 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); | 198 | int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); |
194 | void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); | 199 | void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); |
195 | void btrfs_get_logged_extents(struct inode *inode, | 200 | void btrfs_get_logged_extents(struct inode *inode, |
196 | struct list_head *logged_list); | 201 | struct list_head *logged_list, |
202 | const loff_t start, | ||
203 | const loff_t end); | ||
197 | void btrfs_put_logged_extents(struct list_head *logged_list); | 204 | void btrfs_put_logged_extents(struct list_head *logged_list); |
198 | void btrfs_submit_logged_extents(struct list_head *logged_list, | 205 | void btrfs_submit_logged_extents(struct list_head *logged_list, |
199 | struct btrfs_root *log); | 206 | struct btrfs_root *log); |
200 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); | 207 | void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, |
208 | struct btrfs_root *log, u64 transid); | ||
201 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); | 209 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); |
202 | int __init ordered_data_init(void); | 210 | int __init ordered_data_init(void); |
203 | void ordered_data_exit(void); | 211 | void ordered_data_exit(void); |
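A hedged sketch of how the transaction commit side might drain trans->ordered, which btrfs_wait_logged_extents() now populates; the function below and its locking are assumptions, only trans_list, ordered->wait, and btrfs_put_ordered_extent() are taken from this diff:

static void wait_trans_ordered_extents(struct btrfs_trans_handle *trans)
{
	struct btrfs_ordered_extent *ordered;

	while (!list_empty(&trans->ordered)) {
		ordered = list_first_entry(&trans->ordered,
					   struct btrfs_ordered_extent,
					   trans_list);
		list_del_init(&ordered->trans_list);

		/* BTRFS_ORDERED_COMPLETE is signalled through ordered->wait */
		wait_event(ordered->wait,
			   test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags));
		btrfs_put_ordered_extent(ordered);
	}
}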
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6a41631cb959..8ab2a17bbba8 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c | |||
@@ -58,9 +58,23 @@ | |||
58 | */ | 58 | */ |
59 | #define RBIO_CACHE_READY_BIT 3 | 59 | #define RBIO_CACHE_READY_BIT 3 |
60 | 60 | ||
61 | /* | ||
62 | * bbio and raid_map are managed by the caller, so we shouldn't free | ||
63 | * them here. Besides that, any rbio with this flag set should not | ||
64 | * be cached, because we need the raid_map to check whether two rbios | ||
65 | * cover the same stripe, but it is very likely that the caller has | ||
66 | * already freed the raid_map, so don't cache those rbios. | ||
67 | */ | ||
68 | #define RBIO_HOLD_BBIO_MAP_BIT 4 | ||
61 | 69 | ||
62 | #define RBIO_CACHE_SIZE 1024 | 70 | #define RBIO_CACHE_SIZE 1024 |
63 | 71 | ||
72 | enum btrfs_rbio_ops { | ||
73 | BTRFS_RBIO_WRITE = 0, | ||
74 | BTRFS_RBIO_READ_REBUILD = 1, | ||
75 | BTRFS_RBIO_PARITY_SCRUB = 2, | ||
76 | }; | ||
77 | |||
64 | struct btrfs_raid_bio { | 78 | struct btrfs_raid_bio { |
65 | struct btrfs_fs_info *fs_info; | 79 | struct btrfs_fs_info *fs_info; |
66 | struct btrfs_bio *bbio; | 80 | struct btrfs_bio *bbio; |
@@ -117,13 +131,16 @@ struct btrfs_raid_bio { | |||
117 | /* number of data stripes (no p/q) */ | 131 | /* number of data stripes (no p/q) */ |
118 | int nr_data; | 132 | int nr_data; |
119 | 133 | ||
134 | int real_stripes; | ||
135 | |||
136 | int stripe_npages; | ||
120 | /* | 137 | /* |
121 | * set if we're doing a parity rebuild | 138 | * set if we're doing a parity rebuild |
122 | * for a read from higher up, which is handled | 139 | * for a read from higher up, which is handled |
123 | * differently from a parity rebuild as part of | 140 | * differently from a parity rebuild as part of |
124 | * rmw | 141 | * rmw |
125 | */ | 142 | */ |
126 | int read_rebuild; | 143 | enum btrfs_rbio_ops operation; |
127 | 144 | ||
128 | /* first bad stripe */ | 145 | /* first bad stripe */ |
129 | int faila; | 146 | int faila; |
@@ -131,6 +148,7 @@ struct btrfs_raid_bio { | |||
131 | /* second bad stripe (for raid6 use) */ | 148 | /* second bad stripe (for raid6 use) */ |
132 | int failb; | 149 | int failb; |
133 | 150 | ||
151 | int scrubp; | ||
134 | /* | 152 | /* |
135 | * number of pages needed to represent the full | 153 | * number of pages needed to represent the full |
136 | * stripe | 154 | * stripe |
@@ -144,8 +162,13 @@ struct btrfs_raid_bio { | |||
144 | */ | 162 | */ |
145 | int bio_list_bytes; | 163 | int bio_list_bytes; |
146 | 164 | ||
165 | int generic_bio_cnt; | ||
166 | |||
147 | atomic_t refs; | 167 | atomic_t refs; |
148 | 168 | ||
169 | atomic_t stripes_pending; | ||
170 | |||
171 | atomic_t error; | ||
149 | /* | 172 | /* |
150 | * these are two arrays of pointers. We allocate the | 173 | * these are two arrays of pointers. We allocate the |
151 | * rbio big enough to hold them both and setup their | 174 | * rbio big enough to hold them both and setup their |
@@ -162,6 +185,11 @@ struct btrfs_raid_bio { | |||
162 | * here for faster lookup | 185 | * here for faster lookup |
163 | */ | 186 | */ |
164 | struct page **bio_pages; | 187 | struct page **bio_pages; |
188 | |||
189 | /* | ||
190 | * bitmap to record which horizontal stripes have data | ||
191 | */ | ||
192 | unsigned long *dbitmap; | ||
165 | }; | 193 | }; |
166 | 194 | ||
167 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); | 195 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); |
@@ -176,6 +204,10 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio); | |||
176 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); | 204 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); |
177 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); | 205 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); |
178 | 206 | ||
207 | static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, | ||
208 | int need_check); | ||
209 | static void async_scrub_parity(struct btrfs_raid_bio *rbio); | ||
210 | |||
179 | /* | 211 | /* |
180 | * the stripe hash table is used for locking, and to collect | 212 | * the stripe hash table is used for locking, and to collect |
181 | * bios in hopes of making a full stripe | 213 | * bios in hopes of making a full stripe |
@@ -324,6 +356,7 @@ static void merge_rbio(struct btrfs_raid_bio *dest, | |||
324 | { | 356 | { |
325 | bio_list_merge(&dest->bio_list, &victim->bio_list); | 357 | bio_list_merge(&dest->bio_list, &victim->bio_list); |
326 | dest->bio_list_bytes += victim->bio_list_bytes; | 358 | dest->bio_list_bytes += victim->bio_list_bytes; |
359 | dest->generic_bio_cnt += victim->generic_bio_cnt; | ||
327 | bio_list_init(&victim->bio_list); | 360 | bio_list_init(&victim->bio_list); |
328 | } | 361 | } |
329 | 362 | ||
@@ -577,11 +610,20 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, | |||
577 | cur->raid_map[0]) | 610 | cur->raid_map[0]) |
578 | return 0; | 611 | return 0; |
579 | 612 | ||
580 | /* reads can't merge with writes */ | 613 | /* we can't merge with different operations */ |
581 | if (last->read_rebuild != | 614 | if (last->operation != cur->operation) |
582 | cur->read_rebuild) { | 615 | return 0; |
616 | /* | ||
617 | * We need to read the full stripe from the drive, check and | ||
618 | * repair the parity, and write out the new results. | ||
619 | * | ||
620 | * We're not allowed to add any new bios to the | ||
621 | * bio list here, anyone else that wants to | ||
622 | * change this stripe needs to do their own rmw. | ||
623 | */ | ||
624 | if (last->operation == BTRFS_RBIO_PARITY_SCRUB || | ||
625 | cur->operation == BTRFS_RBIO_PARITY_SCRUB) | ||
583 | return 0; | 626 | return 0; |
584 | } | ||
585 | 627 | ||
586 | return 1; | 628 | return 1; |
587 | } | 629 | } |
@@ -601,7 +643,7 @@ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) | |||
601 | */ | 643 | */ |
602 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) | 644 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) |
603 | { | 645 | { |
604 | if (rbio->nr_data + 1 == rbio->bbio->num_stripes) | 646 | if (rbio->nr_data + 1 == rbio->real_stripes) |
605 | return NULL; | 647 | return NULL; |
606 | 648 | ||
607 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> | 649 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> |
@@ -772,11 +814,14 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) | |||
772 | spin_unlock(&rbio->bio_list_lock); | 814 | spin_unlock(&rbio->bio_list_lock); |
773 | spin_unlock_irqrestore(&h->lock, flags); | 815 | spin_unlock_irqrestore(&h->lock, flags); |
774 | 816 | ||
775 | if (next->read_rebuild) | 817 | if (next->operation == BTRFS_RBIO_READ_REBUILD) |
776 | async_read_rebuild(next); | 818 | async_read_rebuild(next); |
777 | else { | 819 | else if (next->operation == BTRFS_RBIO_WRITE) { |
778 | steal_rbio(rbio, next); | 820 | steal_rbio(rbio, next); |
779 | async_rmw_stripe(next); | 821 | async_rmw_stripe(next); |
822 | } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { | ||
823 | steal_rbio(rbio, next); | ||
824 | async_scrub_parity(next); | ||
780 | } | 825 | } |
781 | 826 | ||
782 | goto done_nolock; | 827 | goto done_nolock; |
@@ -796,6 +841,21 @@ done_nolock: | |||
796 | remove_rbio_from_cache(rbio); | 841 | remove_rbio_from_cache(rbio); |
797 | } | 842 | } |
798 | 843 | ||
844 | static inline void | ||
845 | __free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need) | ||
846 | { | ||
847 | if (need) { | ||
848 | kfree(raid_map); | ||
849 | kfree(bbio); | ||
850 | } | ||
851 | } | ||
852 | |||
853 | static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio) | ||
854 | { | ||
855 | __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map, | ||
856 | !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)); | ||
857 | } | ||
858 | |||
799 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) | 859 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) |
800 | { | 860 | { |
801 | int i; | 861 | int i; |
@@ -814,8 +874,9 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) | |||
814 | rbio->stripe_pages[i] = NULL; | 874 | rbio->stripe_pages[i] = NULL; |
815 | } | 875 | } |
816 | } | 876 | } |
817 | kfree(rbio->raid_map); | 877 | |
818 | kfree(rbio->bbio); | 878 | free_bbio_and_raid_map(rbio); |
879 | |||
819 | kfree(rbio); | 880 | kfree(rbio); |
820 | } | 881 | } |
821 | 882 | ||
@@ -833,6 +894,10 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) | |||
833 | { | 894 | { |
834 | struct bio *cur = bio_list_get(&rbio->bio_list); | 895 | struct bio *cur = bio_list_get(&rbio->bio_list); |
835 | struct bio *next; | 896 | struct bio *next; |
897 | |||
898 | if (rbio->generic_bio_cnt) | ||
899 | btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); | ||
900 | |||
836 | free_raid_bio(rbio); | 901 | free_raid_bio(rbio); |
837 | 902 | ||
838 | while (cur) { | 903 | while (cur) { |
@@ -858,13 +923,13 @@ static void raid_write_end_io(struct bio *bio, int err) | |||
858 | 923 | ||
859 | bio_put(bio); | 924 | bio_put(bio); |
860 | 925 | ||
861 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | 926 | if (!atomic_dec_and_test(&rbio->stripes_pending)) |
862 | return; | 927 | return; |
863 | 928 | ||
864 | err = 0; | 929 | err = 0; |
865 | 930 | ||
866 | /* OK, we have read all the stripes we need to. */ | 931 | /* OK, we have read all the stripes we need to. */ |
867 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | 932 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) |
868 | err = -EIO; | 933 | err = -EIO; |
869 | 934 | ||
870 | rbio_orig_end_io(rbio, err, 0); | 935 | rbio_orig_end_io(rbio, err, 0); |
@@ -925,16 +990,16 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | |||
925 | { | 990 | { |
926 | struct btrfs_raid_bio *rbio; | 991 | struct btrfs_raid_bio *rbio; |
927 | int nr_data = 0; | 992 | int nr_data = 0; |
928 | int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); | 993 | int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; |
994 | int num_pages = rbio_nr_pages(stripe_len, real_stripes); | ||
995 | int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); | ||
929 | void *p; | 996 | void *p; |
930 | 997 | ||
931 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, | 998 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + |
999 | DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), | ||
932 | GFP_NOFS); | 1000 | GFP_NOFS); |
933 | if (!rbio) { | 1001 | if (!rbio) |
934 | kfree(raid_map); | ||
935 | kfree(bbio); | ||
936 | return ERR_PTR(-ENOMEM); | 1002 | return ERR_PTR(-ENOMEM); |
937 | } | ||
938 | 1003 | ||
939 | bio_list_init(&rbio->bio_list); | 1004 | bio_list_init(&rbio->bio_list); |
940 | INIT_LIST_HEAD(&rbio->plug_list); | 1005 | INIT_LIST_HEAD(&rbio->plug_list); |
@@ -946,9 +1011,13 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | |||
946 | rbio->fs_info = root->fs_info; | 1011 | rbio->fs_info = root->fs_info; |
947 | rbio->stripe_len = stripe_len; | 1012 | rbio->stripe_len = stripe_len; |
948 | rbio->nr_pages = num_pages; | 1013 | rbio->nr_pages = num_pages; |
1014 | rbio->real_stripes = real_stripes; | ||
1015 | rbio->stripe_npages = stripe_npages; | ||
949 | rbio->faila = -1; | 1016 | rbio->faila = -1; |
950 | rbio->failb = -1; | 1017 | rbio->failb = -1; |
951 | atomic_set(&rbio->refs, 1); | 1018 | atomic_set(&rbio->refs, 1); |
1019 | atomic_set(&rbio->error, 0); | ||
1020 | atomic_set(&rbio->stripes_pending, 0); | ||
952 | 1021 | ||
953 | /* | 1022 | /* |
954 | * the stripe_pages and bio_pages array point to the extra | 1023 | * the stripe_pages and bio_pages array point to the extra |
@@ -957,11 +1026,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | |||
957 | p = rbio + 1; | 1026 | p = rbio + 1; |
958 | rbio->stripe_pages = p; | 1027 | rbio->stripe_pages = p; |
959 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; | 1028 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; |
1029 | rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; | ||
960 | 1030 | ||
961 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | 1031 | if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) |
962 | nr_data = bbio->num_stripes - 2; | 1032 | nr_data = real_stripes - 2; |
963 | else | 1033 | else |
964 | nr_data = bbio->num_stripes - 1; | 1034 | nr_data = real_stripes - 1; |
965 | 1035 | ||
966 | rbio->nr_data = nr_data; | 1036 | rbio->nr_data = nr_data; |
967 | return rbio; | 1037 | return rbio; |
@@ -1073,7 +1143,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, | |||
1073 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) | 1143 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) |
1074 | { | 1144 | { |
1075 | if (rbio->faila >= 0 || rbio->failb >= 0) { | 1145 | if (rbio->faila >= 0 || rbio->failb >= 0) { |
1076 | BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); | 1146 | BUG_ON(rbio->faila == rbio->real_stripes - 1); |
1077 | __raid56_parity_recover(rbio); | 1147 | __raid56_parity_recover(rbio); |
1078 | } else { | 1148 | } else { |
1079 | finish_rmw(rbio); | 1149 | finish_rmw(rbio); |
@@ -1134,7 +1204,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) | |||
1134 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | 1204 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) |
1135 | { | 1205 | { |
1136 | struct btrfs_bio *bbio = rbio->bbio; | 1206 | struct btrfs_bio *bbio = rbio->bbio; |
1137 | void *pointers[bbio->num_stripes]; | 1207 | void *pointers[rbio->real_stripes]; |
1138 | int stripe_len = rbio->stripe_len; | 1208 | int stripe_len = rbio->stripe_len; |
1139 | int nr_data = rbio->nr_data; | 1209 | int nr_data = rbio->nr_data; |
1140 | int stripe; | 1210 | int stripe; |
@@ -1148,11 +1218,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1148 | 1218 | ||
1149 | bio_list_init(&bio_list); | 1219 | bio_list_init(&bio_list); |
1150 | 1220 | ||
1151 | if (bbio->num_stripes - rbio->nr_data == 1) { | 1221 | if (rbio->real_stripes - rbio->nr_data == 1) { |
1152 | p_stripe = bbio->num_stripes - 1; | 1222 | p_stripe = rbio->real_stripes - 1; |
1153 | } else if (bbio->num_stripes - rbio->nr_data == 2) { | 1223 | } else if (rbio->real_stripes - rbio->nr_data == 2) { |
1154 | p_stripe = bbio->num_stripes - 2; | 1224 | p_stripe = rbio->real_stripes - 2; |
1155 | q_stripe = bbio->num_stripes - 1; | 1225 | q_stripe = rbio->real_stripes - 1; |
1156 | } else { | 1226 | } else { |
1157 | BUG(); | 1227 | BUG(); |
1158 | } | 1228 | } |
@@ -1169,7 +1239,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1169 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | 1239 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); |
1170 | spin_unlock_irq(&rbio->bio_list_lock); | 1240 | spin_unlock_irq(&rbio->bio_list_lock); |
1171 | 1241 | ||
1172 | atomic_set(&rbio->bbio->error, 0); | 1242 | atomic_set(&rbio->error, 0); |
1173 | 1243 | ||
1174 | /* | 1244 | /* |
1175 | * now that we've set rmw_locked, run through the | 1245 | * now that we've set rmw_locked, run through the |
@@ -1209,7 +1279,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1209 | SetPageUptodate(p); | 1279 | SetPageUptodate(p); |
1210 | pointers[stripe++] = kmap(p); | 1280 | pointers[stripe++] = kmap(p); |
1211 | 1281 | ||
1212 | raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, | 1282 | raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, |
1213 | pointers); | 1283 | pointers); |
1214 | } else { | 1284 | } else { |
1215 | /* raid5 */ | 1285 | /* raid5 */ |
@@ -1218,7 +1288,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1218 | } | 1288 | } |
1219 | 1289 | ||
1220 | 1290 | ||
1221 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) | 1291 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) |
1222 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | 1292 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); |
1223 | } | 1293 | } |
1224 | 1294 | ||
@@ -1227,7 +1297,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1227 | * higher layers (the bio_list in our rbio) and our p/q. Ignore | 1297 | * higher layers (the bio_list in our rbio) and our p/q. Ignore |
1228 | * everything else. | 1298 | * everything else. |
1229 | */ | 1299 | */ |
1230 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | 1300 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1231 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | 1301 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { |
1232 | struct page *page; | 1302 | struct page *page; |
1233 | if (stripe < rbio->nr_data) { | 1303 | if (stripe < rbio->nr_data) { |
@@ -1245,8 +1315,34 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | |||
1245 | } | 1315 | } |
1246 | } | 1316 | } |
1247 | 1317 | ||
1248 | atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); | 1318 | if (likely(!bbio->num_tgtdevs)) |
1249 | BUG_ON(atomic_read(&bbio->stripes_pending) == 0); | 1319 | goto write_data; |
1320 | |||
1321 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { | ||
1322 | if (!bbio->tgtdev_map[stripe]) | ||
1323 | continue; | ||
1324 | |||
1325 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
1326 | struct page *page; | ||
1327 | if (stripe < rbio->nr_data) { | ||
1328 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
1329 | if (!page) | ||
1330 | continue; | ||
1331 | } else { | ||
1332 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
1333 | } | ||
1334 | |||
1335 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
1336 | rbio->bbio->tgtdev_map[stripe], | ||
1337 | pagenr, rbio->stripe_len); | ||
1338 | if (ret) | ||
1339 | goto cleanup; | ||
1340 | } | ||
1341 | } | ||
1342 | |||
1343 | write_data: | ||
1344 | atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); | ||
1345 | BUG_ON(atomic_read(&rbio->stripes_pending) == 0); | ||
1250 | 1346 | ||
1251 | while (1) { | 1347 | while (1) { |
1252 | bio = bio_list_pop(&bio_list); | 1348 | bio = bio_list_pop(&bio_list); |
@@ -1283,7 +1379,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, | |||
1283 | stripe = &rbio->bbio->stripes[i]; | 1379 | stripe = &rbio->bbio->stripes[i]; |
1284 | stripe_start = stripe->physical; | 1380 | stripe_start = stripe->physical; |
1285 | if (physical >= stripe_start && | 1381 | if (physical >= stripe_start && |
1286 | physical < stripe_start + rbio->stripe_len) { | 1382 | physical < stripe_start + rbio->stripe_len && |
1383 | bio->bi_bdev == stripe->dev->bdev) { | ||
1287 | return i; | 1384 | return i; |
1288 | } | 1385 | } |
1289 | } | 1386 | } |
@@ -1331,11 +1428,11 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) | |||
1331 | if (rbio->faila == -1) { | 1428 | if (rbio->faila == -1) { |
1332 | /* first failure on this rbio */ | 1429 | /* first failure on this rbio */ |
1333 | rbio->faila = failed; | 1430 | rbio->faila = failed; |
1334 | atomic_inc(&rbio->bbio->error); | 1431 | atomic_inc(&rbio->error); |
1335 | } else if (rbio->failb == -1) { | 1432 | } else if (rbio->failb == -1) { |
1336 | /* second failure on this rbio */ | 1433 | /* second failure on this rbio */ |
1337 | rbio->failb = failed; | 1434 | rbio->failb = failed; |
1338 | atomic_inc(&rbio->bbio->error); | 1435 | atomic_inc(&rbio->error); |
1339 | } else { | 1436 | } else { |
1340 | ret = -EIO; | 1437 | ret = -EIO; |
1341 | } | 1438 | } |
@@ -1394,11 +1491,11 @@ static void raid_rmw_end_io(struct bio *bio, int err) | |||
1394 | 1491 | ||
1395 | bio_put(bio); | 1492 | bio_put(bio); |
1396 | 1493 | ||
1397 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | 1494 | if (!atomic_dec_and_test(&rbio->stripes_pending)) |
1398 | return; | 1495 | return; |
1399 | 1496 | ||
1400 | err = 0; | 1497 | err = 0; |
1401 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | 1498 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) |
1402 | goto cleanup; | 1499 | goto cleanup; |
1403 | 1500 | ||
1404 | /* | 1501 | /* |
@@ -1439,7 +1536,6 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio) | |||
1439 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | 1536 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) |
1440 | { | 1537 | { |
1441 | int bios_to_read = 0; | 1538 | int bios_to_read = 0; |
1442 | struct btrfs_bio *bbio = rbio->bbio; | ||
1443 | struct bio_list bio_list; | 1539 | struct bio_list bio_list; |
1444 | int ret; | 1540 | int ret; |
1445 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); | 1541 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); |
@@ -1455,7 +1551,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | |||
1455 | 1551 | ||
1456 | index_rbio_pages(rbio); | 1552 | index_rbio_pages(rbio); |
1457 | 1553 | ||
1458 | atomic_set(&rbio->bbio->error, 0); | 1554 | atomic_set(&rbio->error, 0); |
1459 | /* | 1555 | /* |
1460 | * build a list of bios to read all the missing parts of this | 1556 | * build a list of bios to read all the missing parts of this |
1461 | * stripe | 1557 | * stripe |
@@ -1503,7 +1599,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | |||
1503 | * the bbio may be freed once we submit the last bio. Make sure | 1599 | * the bbio may be freed once we submit the last bio. Make sure |
1504 | * not to touch it after that | 1600 | * not to touch it after that |
1505 | */ | 1601 | */ |
1506 | atomic_set(&bbio->stripes_pending, bios_to_read); | 1602 | atomic_set(&rbio->stripes_pending, bios_to_read); |
1507 | while (1) { | 1603 | while (1) { |
1508 | bio = bio_list_pop(&bio_list); | 1604 | bio = bio_list_pop(&bio_list); |
1509 | if (!bio) | 1605 | if (!bio) |
@@ -1686,19 +1782,30 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | |||
1686 | struct btrfs_raid_bio *rbio; | 1782 | struct btrfs_raid_bio *rbio; |
1687 | struct btrfs_plug_cb *plug = NULL; | 1783 | struct btrfs_plug_cb *plug = NULL; |
1688 | struct blk_plug_cb *cb; | 1784 | struct blk_plug_cb *cb; |
1785 | int ret; | ||
1689 | 1786 | ||
1690 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | 1787 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); |
1691 | if (IS_ERR(rbio)) | 1788 | if (IS_ERR(rbio)) { |
1789 | __free_bbio_and_raid_map(bbio, raid_map, 1); | ||
1692 | return PTR_ERR(rbio); | 1790 | return PTR_ERR(rbio); |
1791 | } | ||
1693 | bio_list_add(&rbio->bio_list, bio); | 1792 | bio_list_add(&rbio->bio_list, bio); |
1694 | rbio->bio_list_bytes = bio->bi_iter.bi_size; | 1793 | rbio->bio_list_bytes = bio->bi_iter.bi_size; |
1794 | rbio->operation = BTRFS_RBIO_WRITE; | ||
1795 | |||
1796 | btrfs_bio_counter_inc_noblocked(root->fs_info); | ||
1797 | rbio->generic_bio_cnt = 1; | ||
1695 | 1798 | ||
1696 | /* | 1799 | /* |
1697 | * don't plug on full rbios, just get them out the door | 1800 | * don't plug on full rbios, just get them out the door |
1698 | * as quickly as we can | 1801 | * as quickly as we can |
1699 | */ | 1802 | */ |
1700 | if (rbio_is_full(rbio)) | 1803 | if (rbio_is_full(rbio)) { |
1701 | return full_stripe_write(rbio); | 1804 | ret = full_stripe_write(rbio); |
1805 | if (ret) | ||
1806 | btrfs_bio_counter_dec(root->fs_info); | ||
1807 | return ret; | ||
1808 | } | ||
1702 | 1809 | ||
1703 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, | 1810 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, |
1704 | sizeof(*plug)); | 1811 | sizeof(*plug)); |
@@ -1709,10 +1816,13 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | |||
1709 | INIT_LIST_HEAD(&plug->rbio_list); | 1816 | INIT_LIST_HEAD(&plug->rbio_list); |
1710 | } | 1817 | } |
1711 | list_add_tail(&rbio->plug_list, &plug->rbio_list); | 1818 | list_add_tail(&rbio->plug_list, &plug->rbio_list); |
1819 | ret = 0; | ||
1712 | } else { | 1820 | } else { |
1713 | return __raid56_parity_write(rbio); | 1821 | ret = __raid56_parity_write(rbio); |
1822 | if (ret) | ||
1823 | btrfs_bio_counter_dec(root->fs_info); | ||
1714 | } | 1824 | } |
1715 | return 0; | 1825 | return ret; |
1716 | } | 1826 | } |
1717 | 1827 | ||
1718 | /* | 1828 | /* |
@@ -1730,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1730 | int err; | 1840 | int err; |
1731 | int i; | 1841 | int i; |
1732 | 1842 | ||
1733 | pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), | 1843 | pointers = kzalloc(rbio->real_stripes * sizeof(void *), |
1734 | GFP_NOFS); | 1844 | GFP_NOFS); |
1735 | if (!pointers) { | 1845 | if (!pointers) { |
1736 | err = -ENOMEM; | 1846 | err = -ENOMEM; |
@@ -1740,7 +1850,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1740 | faila = rbio->faila; | 1850 | faila = rbio->faila; |
1741 | failb = rbio->failb; | 1851 | failb = rbio->failb; |
1742 | 1852 | ||
1743 | if (rbio->read_rebuild) { | 1853 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { |
1744 | spin_lock_irq(&rbio->bio_list_lock); | 1854 | spin_lock_irq(&rbio->bio_list_lock); |
1745 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | 1855 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); |
1746 | spin_unlock_irq(&rbio->bio_list_lock); | 1856 | spin_unlock_irq(&rbio->bio_list_lock); |
@@ -1749,15 +1859,23 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1749 | index_rbio_pages(rbio); | 1859 | index_rbio_pages(rbio); |
1750 | 1860 | ||
1751 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | 1861 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { |
1862 | /* | ||
1863 | * When doing a parity scrub, we just use a bitmap to mark | ||
1864 | * the horizontal stripes in which we have data. | ||
1865 | */ | ||
1866 | if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && | ||
1867 | !test_bit(pagenr, rbio->dbitmap)) | ||
1868 | continue; | ||
1869 | |||
1752 | /* setup our array of pointers with pages | 1870 | /* setup our array of pointers with pages |
1753 | * from each stripe | 1871 | * from each stripe |
1754 | */ | 1872 | */ |
1755 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | 1873 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1756 | /* | 1874 | /* |
1757 | * if we're rebuilding a read, we have to use | 1875 | * if we're rebuilding a read, we have to use |
1758 | * pages from the bio list | 1876 | * pages from the bio list |
1759 | */ | 1877 | */ |
1760 | if (rbio->read_rebuild && | 1878 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD && |
1761 | (stripe == faila || stripe == failb)) { | 1879 | (stripe == faila || stripe == failb)) { |
1762 | page = page_in_rbio(rbio, stripe, pagenr, 0); | 1880 | page = page_in_rbio(rbio, stripe, pagenr, 0); |
1763 | } else { | 1881 | } else { |
@@ -1767,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1767 | } | 1885 | } |
1768 | 1886 | ||
1769 | /* all raid6 handling here */ | 1887 | /* all raid6 handling here */ |
1770 | if (rbio->raid_map[rbio->bbio->num_stripes - 1] == | 1888 | if (rbio->raid_map[rbio->real_stripes - 1] == |
1771 | RAID6_Q_STRIPE) { | 1889 | RAID6_Q_STRIPE) { |
1772 | 1890 | ||
1773 | /* | 1891 | /* |
@@ -1817,10 +1935,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | |||
1817 | } | 1935 | } |
1818 | 1936 | ||
1819 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { | 1937 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { |
1820 | raid6_datap_recov(rbio->bbio->num_stripes, | 1938 | raid6_datap_recov(rbio->real_stripes, |
1821 | PAGE_SIZE, faila, pointers); | 1939 | PAGE_SIZE, faila, pointers); |
1822 | } else { | 1940 | } else { |
1823 | raid6_2data_recov(rbio->bbio->num_stripes, | 1941 | raid6_2data_recov(rbio->real_stripes, |
1824 | PAGE_SIZE, faila, failb, | 1942 | PAGE_SIZE, faila, failb, |
1825 | pointers); | 1943 | pointers); |
1826 | } | 1944 | } |
@@ -1850,7 +1968,7 @@ pstripe: | |||
1850 | * know they can be trusted. If this was a read reconstruction, | 1968 | * know they can be trusted. If this was a read reconstruction, |
1851 | * other endio functions will fiddle the uptodate bits | 1969 | * other endio functions will fiddle the uptodate bits |
1852 | */ | 1970 | */ |
1853 | if (!rbio->read_rebuild) { | 1971 | if (rbio->operation == BTRFS_RBIO_WRITE) { |
1854 | for (i = 0; i < nr_pages; i++) { | 1972 | for (i = 0; i < nr_pages; i++) { |
1855 | if (faila != -1) { | 1973 | if (faila != -1) { |
1856 | page = rbio_stripe_page(rbio, faila, i); | 1974 | page = rbio_stripe_page(rbio, faila, i); |
@@ -1862,12 +1980,12 @@ pstripe: | |||
1862 | } | 1980 | } |
1863 | } | 1981 | } |
1864 | } | 1982 | } |
1865 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | 1983 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1866 | /* | 1984 | /* |
1867 | * if we're rebuilding a read, we have to use | 1985 | * if we're rebuilding a read, we have to use |
1868 | * pages from the bio list | 1986 | * pages from the bio list |
1869 | */ | 1987 | */ |
1870 | if (rbio->read_rebuild && | 1988 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD && |
1871 | (stripe == faila || stripe == failb)) { | 1989 | (stripe == faila || stripe == failb)) { |
1872 | page = page_in_rbio(rbio, stripe, pagenr, 0); | 1990 | page = page_in_rbio(rbio, stripe, pagenr, 0); |
1873 | } else { | 1991 | } else { |
@@ -1882,9 +2000,9 @@ cleanup: | |||
1882 | kfree(pointers); | 2000 | kfree(pointers); |
1883 | 2001 | ||
1884 | cleanup_io: | 2002 | cleanup_io: |
1885 | 2003 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { | |
1886 | if (rbio->read_rebuild) { | 2004 | if (err == 0 && |
1887 | if (err == 0) | 2005 | !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags)) |
1888 | cache_rbio_pages(rbio); | 2006 | cache_rbio_pages(rbio); |
1889 | else | 2007 | else |
1890 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | 2008 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); |
@@ -1893,7 +2011,13 @@ cleanup_io: | |||
1893 | } else if (err == 0) { | 2011 | } else if (err == 0) { |
1894 | rbio->faila = -1; | 2012 | rbio->faila = -1; |
1895 | rbio->failb = -1; | 2013 | rbio->failb = -1; |
1896 | finish_rmw(rbio); | 2014 | |
2015 | if (rbio->operation == BTRFS_RBIO_WRITE) | ||
2016 | finish_rmw(rbio); | ||
2017 | else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) | ||
2018 | finish_parity_scrub(rbio, 0); | ||
2019 | else | ||
2020 | BUG(); | ||
1897 | } else { | 2021 | } else { |
1898 | rbio_orig_end_io(rbio, err, 0); | 2022 | rbio_orig_end_io(rbio, err, 0); |
1899 | } | 2023 | } |
@@ -1917,10 +2041,10 @@ static void raid_recover_end_io(struct bio *bio, int err) | |||
1917 | set_bio_pages_uptodate(bio); | 2041 | set_bio_pages_uptodate(bio); |
1918 | bio_put(bio); | 2042 | bio_put(bio); |
1919 | 2043 | ||
1920 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | 2044 | if (!atomic_dec_and_test(&rbio->stripes_pending)) |
1921 | return; | 2045 | return; |
1922 | 2046 | ||
1923 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | 2047 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) |
1924 | rbio_orig_end_io(rbio, -EIO, 0); | 2048 | rbio_orig_end_io(rbio, -EIO, 0); |
1925 | else | 2049 | else |
1926 | __raid_recover_end_io(rbio); | 2050 | __raid_recover_end_io(rbio); |
@@ -1937,7 +2061,6 @@ static void raid_recover_end_io(struct bio *bio, int err) | |||
1937 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | 2061 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) |
1938 | { | 2062 | { |
1939 | int bios_to_read = 0; | 2063 | int bios_to_read = 0; |
1940 | struct btrfs_bio *bbio = rbio->bbio; | ||
1941 | struct bio_list bio_list; | 2064 | struct bio_list bio_list; |
1942 | int ret; | 2065 | int ret; |
1943 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); | 2066 | int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); |
@@ -1951,16 +2074,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | |||
1951 | if (ret) | 2074 | if (ret) |
1952 | goto cleanup; | 2075 | goto cleanup; |
1953 | 2076 | ||
1954 | atomic_set(&rbio->bbio->error, 0); | 2077 | atomic_set(&rbio->error, 0); |
1955 | 2078 | ||
1956 | /* | 2079 | /* |
1957 | * read everything that hasn't failed. Thanks to the | 2080 | * read everything that hasn't failed. Thanks to the |
1958 | * stripe cache, it is possible that some or all of these | 2081 | * stripe cache, it is possible that some or all of these |
1959 | * pages are going to be uptodate. | 2082 | * pages are going to be uptodate. |
1960 | */ | 2083 | */ |
1961 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | 2084 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { |
1962 | if (rbio->faila == stripe || rbio->failb == stripe) { | 2085 | if (rbio->faila == stripe || rbio->failb == stripe) { |
1963 | atomic_inc(&rbio->bbio->error); | 2086 | atomic_inc(&rbio->error); |
1964 | continue; | 2087 | continue; |
1965 | } | 2088 | } |
1966 | 2089 | ||
@@ -1990,7 +2113,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | |||
1990 | * were up to date, or we might have no bios to read because | 2113 | * were up to date, or we might have no bios to read because |
1991 | * the devices were gone. | 2114 | * the devices were gone. |
1992 | */ | 2115 | */ |
1993 | if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { | 2116 | if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { |
1994 | __raid_recover_end_io(rbio); | 2117 | __raid_recover_end_io(rbio); |
1995 | goto out; | 2118 | goto out; |
1996 | } else { | 2119 | } else { |
@@ -2002,7 +2125,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | |||
2002 | * the bbio may be freed once we submit the last bio. Make sure | 2125 | * the bbio may be freed once we submit the last bio. Make sure |
2003 | * not to touch it after that | 2126 | * not to touch it after that |
2004 | */ | 2127 | */ |
2005 | atomic_set(&bbio->stripes_pending, bios_to_read); | 2128 | atomic_set(&rbio->stripes_pending, bios_to_read); |
2006 | while (1) { | 2129 | while (1) { |
2007 | bio = bio_list_pop(&bio_list); | 2130 | bio = bio_list_pop(&bio_list); |
2008 | if (!bio) | 2131 | if (!bio) |
@@ -2021,7 +2144,7 @@ out: | |||
2021 | return 0; | 2144 | return 0; |
2022 | 2145 | ||
2023 | cleanup: | 2146 | cleanup: |
2024 | if (rbio->read_rebuild) | 2147 | if (rbio->operation == BTRFS_RBIO_READ_REBUILD) |
2025 | rbio_orig_end_io(rbio, -EIO, 0); | 2148 | rbio_orig_end_io(rbio, -EIO, 0); |
2026 | return -EIO; | 2149 | return -EIO; |
2027 | } | 2150 | } |
@@ -2034,34 +2157,42 @@ cleanup: | |||
2034 | */ | 2157 | */ |
2035 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | 2158 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, |
2036 | struct btrfs_bio *bbio, u64 *raid_map, | 2159 | struct btrfs_bio *bbio, u64 *raid_map, |
2037 | u64 stripe_len, int mirror_num) | 2160 | u64 stripe_len, int mirror_num, int generic_io) |
2038 | { | 2161 | { |
2039 | struct btrfs_raid_bio *rbio; | 2162 | struct btrfs_raid_bio *rbio; |
2040 | int ret; | 2163 | int ret; |
2041 | 2164 | ||
2042 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | 2165 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); |
2043 | if (IS_ERR(rbio)) | 2166 | if (IS_ERR(rbio)) { |
2167 | __free_bbio_and_raid_map(bbio, raid_map, generic_io); | ||
2044 | return PTR_ERR(rbio); | 2168 | return PTR_ERR(rbio); |
2169 | } | ||
2045 | 2170 | ||
2046 | rbio->read_rebuild = 1; | 2171 | rbio->operation = BTRFS_RBIO_READ_REBUILD; |
2047 | bio_list_add(&rbio->bio_list, bio); | 2172 | bio_list_add(&rbio->bio_list, bio); |
2048 | rbio->bio_list_bytes = bio->bi_iter.bi_size; | 2173 | rbio->bio_list_bytes = bio->bi_iter.bi_size; |
2049 | 2174 | ||
2050 | rbio->faila = find_logical_bio_stripe(rbio, bio); | 2175 | rbio->faila = find_logical_bio_stripe(rbio, bio); |
2051 | if (rbio->faila == -1) { | 2176 | if (rbio->faila == -1) { |
2052 | BUG(); | 2177 | BUG(); |
2053 | kfree(raid_map); | 2178 | __free_bbio_and_raid_map(bbio, raid_map, generic_io); |
2054 | kfree(bbio); | ||
2055 | kfree(rbio); | 2179 | kfree(rbio); |
2056 | return -EIO; | 2180 | return -EIO; |
2057 | } | 2181 | } |
2058 | 2182 | ||
2183 | if (generic_io) { | ||
2184 | btrfs_bio_counter_inc_noblocked(root->fs_info); | ||
2185 | rbio->generic_bio_cnt = 1; | ||
2186 | } else { | ||
2187 | set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); | ||
2188 | } | ||
2189 | |||
2059 | /* | 2190 | /* |
2060 | * reconstruct from the q stripe if they are | 2191 | * reconstruct from the q stripe if they are |
2061 | * asking for mirror 3 | 2192 | * asking for mirror 3 |
2062 | */ | 2193 | */ |
2063 | if (mirror_num == 3) | 2194 | if (mirror_num == 3) |
2064 | rbio->failb = bbio->num_stripes - 2; | 2195 | rbio->failb = rbio->real_stripes - 2; |
2065 | 2196 | ||
2066 | ret = lock_stripe_add(rbio); | 2197 | ret = lock_stripe_add(rbio); |
2067 | 2198 | ||
@@ -2098,3 +2229,483 @@ static void read_rebuild_work(struct btrfs_work *work) | |||
2098 | rbio = container_of(work, struct btrfs_raid_bio, work); | 2229 | rbio = container_of(work, struct btrfs_raid_bio, work); |
2099 | __raid56_parity_recover(rbio); | 2230 | __raid56_parity_recover(rbio); |
2100 | } | 2231 | } |
2232 | |||
2233 | /* | ||
2234 | * The following code is used to scrub/replace the parity stripe | ||
2235 | * | ||
2236 | * Note: we need to make sure that all the pages added to the scrub/replace | ||
2237 | * raid bio are correct and are not changed during the scrub/replace, i.e. | ||
2238 | * the pages hold only metadata or file data covered by a checksum. | ||
2239 | */ | ||
2240 | |||
2241 | struct btrfs_raid_bio * | ||
2242 | raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, | ||
2243 | struct btrfs_bio *bbio, u64 *raid_map, | ||
2244 | u64 stripe_len, struct btrfs_device *scrub_dev, | ||
2245 | unsigned long *dbitmap, int stripe_nsectors) | ||
2246 | { | ||
2247 | struct btrfs_raid_bio *rbio; | ||
2248 | int i; | ||
2249 | |||
2250 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
2251 | if (IS_ERR(rbio)) | ||
2252 | return NULL; | ||
2253 | bio_list_add(&rbio->bio_list, bio); | ||
2254 | /* | ||
2255 | * This is a special bio which is used to hold the completion handler | ||
2256 | * and make the scrub rbio similar to the other types | ||
2257 | */ | ||
2258 | ASSERT(!bio->bi_iter.bi_size); | ||
2259 | rbio->operation = BTRFS_RBIO_PARITY_SCRUB; | ||
2260 | |||
2261 | for (i = 0; i < rbio->real_stripes; i++) { | ||
2262 | if (bbio->stripes[i].dev == scrub_dev) { | ||
2263 | rbio->scrubp = i; | ||
2264 | break; | ||
2265 | } | ||
2266 | } | ||
2267 | |||
2268 | /* For now we only support the case where sectorsize equals the page size */ | ||
2269 | ASSERT(root->sectorsize == PAGE_SIZE); | ||
2270 | ASSERT(rbio->stripe_npages == stripe_nsectors); | ||
2271 | bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); | ||
2272 | |||
2273 | return rbio; | ||
2274 | } | ||
2275 | |||
2276 | void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, | ||
2277 | struct page *page, u64 logical) | ||
2278 | { | ||
2279 | int stripe_offset; | ||
2280 | int index; | ||
2281 | |||
2282 | ASSERT(logical >= rbio->raid_map[0]); | ||
2283 | ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + | ||
2284 | rbio->stripe_len * rbio->nr_data); | ||
2285 | stripe_offset = (int)(logical - rbio->raid_map[0]); | ||
2286 | index = stripe_offset >> PAGE_CACHE_SHIFT; | ||
2287 | rbio->bio_pages[index] = page; | ||
2288 | } | ||
2289 | |||
2290 | /* | ||
2291 | * We only scrub the parity on the horizontal stripes where we have correct | ||
2292 | * data, so we don't need to allocate pages for all the stripes. | ||
2293 | */ | ||
2294 | static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) | ||
2295 | { | ||
2296 | int i; | ||
2297 | int bit; | ||
2298 | int index; | ||
2299 | struct page *page; | ||
2300 | |||
2301 | for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { | ||
2302 | for (i = 0; i < rbio->real_stripes; i++) { | ||
2303 | index = i * rbio->stripe_npages + bit; | ||
2304 | if (rbio->stripe_pages[index]) | ||
2305 | continue; | ||
2306 | |||
2307 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
2308 | if (!page) | ||
2309 | return -ENOMEM; | ||
2310 | rbio->stripe_pages[index] = page; | ||
2311 | ClearPageUptodate(page); | ||
2312 | } | ||
2313 | } | ||
2314 | return 0; | ||
2315 | } | ||
2316 | |||
2317 | /* | ||
2318 | * end io function used by finish_parity_scrub. When we finally | ||
2319 | * get here, we've written out the scrubbed parity stripe | ||
2320 | */ | ||
2321 | static void raid_write_parity_end_io(struct bio *bio, int err) | ||
2322 | { | ||
2323 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
2324 | |||
2325 | if (err) | ||
2326 | fail_bio_stripe(rbio, bio); | ||
2327 | |||
2328 | bio_put(bio); | ||
2329 | |||
2330 | if (!atomic_dec_and_test(&rbio->stripes_pending)) | ||
2331 | return; | ||
2332 | |||
2333 | err = 0; | ||
2334 | |||
2335 | if (atomic_read(&rbio->error)) | ||
2336 | err = -EIO; | ||
2337 | |||
2338 | rbio_orig_end_io(rbio, err, 0); | ||
2339 | } | ||
2340 | |||
2341 | static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, | ||
2342 | int need_check) | ||
2343 | { | ||
2344 | struct btrfs_bio *bbio = rbio->bbio; | ||
2345 | void *pointers[rbio->real_stripes]; | ||
2346 | DECLARE_BITMAP(pbitmap, rbio->stripe_npages); | ||
2347 | int nr_data = rbio->nr_data; | ||
2348 | int stripe; | ||
2349 | int pagenr; | ||
2350 | int p_stripe = -1; | ||
2351 | int q_stripe = -1; | ||
2352 | struct page *p_page = NULL; | ||
2353 | struct page *q_page = NULL; | ||
2354 | struct bio_list bio_list; | ||
2355 | struct bio *bio; | ||
2356 | int is_replace = 0; | ||
2357 | int ret; | ||
2358 | |||
2359 | bio_list_init(&bio_list); | ||
2360 | |||
2361 | if (rbio->real_stripes - rbio->nr_data == 1) { | ||
2362 | p_stripe = rbio->real_stripes - 1; | ||
2363 | } else if (rbio->real_stripes - rbio->nr_data == 2) { | ||
2364 | p_stripe = rbio->real_stripes - 2; | ||
2365 | q_stripe = rbio->real_stripes - 1; | ||
2366 | } else { | ||
2367 | BUG(); | ||
2368 | } | ||
2369 | |||
2370 | if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { | ||
2371 | is_replace = 1; | ||
2372 | bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); | ||
2373 | } | ||
2374 | |||
2375 | /* | ||
2376 | * The higher layers (the scrubber) are unlikely to use | ||
2377 | * this area of the disk again soon, so don't cache | ||
2378 | * it. | ||
2379 | */ | ||
2380 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
2381 | |||
2382 | if (!need_check) | ||
2383 | goto writeback; | ||
2384 | |||
2385 | p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
2386 | if (!p_page) | ||
2387 | goto cleanup; | ||
2388 | SetPageUptodate(p_page); | ||
2389 | |||
2390 | if (q_stripe != -1) { | ||
2391 | q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
2392 | if (!q_page) { | ||
2393 | __free_page(p_page); | ||
2394 | goto cleanup; | ||
2395 | } | ||
2396 | SetPageUptodate(q_page); | ||
2397 | } | ||
2398 | |||
2399 | atomic_set(&rbio->error, 0); | ||
2400 | |||
2401 | for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { | ||
2402 | struct page *p; | ||
2403 | void *parity; | ||
2404 | /* first collect one page from each data stripe */ | ||
2405 | for (stripe = 0; stripe < nr_data; stripe++) { | ||
2406 | p = page_in_rbio(rbio, stripe, pagenr, 0); | ||
2407 | pointers[stripe] = kmap(p); | ||
2408 | } | ||
2409 | |||
2410 | /* then add the parity stripe */ | ||
2411 | pointers[stripe++] = kmap(p_page); | ||
2412 | |||
2413 | if (q_stripe != -1) { | ||
2414 | |||
2415 | /* | ||
2416 | * raid6, add the qstripe and call the | ||
2417 | * library function to fill in our p/q | ||
2418 | */ | ||
2419 | pointers[stripe++] = kmap(q_page); | ||
2420 | |||
2421 | raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, | ||
2422 | pointers); | ||
2423 | } else { | ||
2424 | /* raid5 */ | ||
2425 | memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); | ||
2426 | run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); | ||
2427 | } | ||
2428 | |||
2429 | /* Check the parity being scrubbed and repair it */ | ||
2430 | p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); | ||
2431 | parity = kmap(p); | ||
2432 | if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE)) | ||
2433 | memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE); | ||
2434 | else | ||
2435 | /* Parity is right, no need to write it back */ | ||
2436 | bitmap_clear(rbio->dbitmap, pagenr, 1); | ||
2437 | kunmap(p); | ||
2438 | |||
2439 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) | ||
2440 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | ||
2441 | } | ||
2442 | |||
2443 | __free_page(p_page); | ||
2444 | if (q_page) | ||
2445 | __free_page(q_page); | ||
2446 | |||
2447 | writeback: | ||
2448 | /* | ||
2449 | * time to start writing. Make bios for the scrubbed parity | ||
2450 | * pages that changed, plus copies for the replace target | ||
2451 | * device if one is set up. Ignore everything else. | ||
2452 | */ | ||
2453 | for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { | ||
2454 | struct page *page; | ||
2455 | |||
2456 | page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); | ||
2457 | ret = rbio_add_io_page(rbio, &bio_list, | ||
2458 | page, rbio->scrubp, pagenr, rbio->stripe_len); | ||
2459 | if (ret) | ||
2460 | goto cleanup; | ||
2461 | } | ||
2462 | |||
2463 | if (!is_replace) | ||
2464 | goto submit_write; | ||
2465 | |||
2466 | for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { | ||
2467 | struct page *page; | ||
2468 | |||
2469 | page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); | ||
2470 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
2471 | bbio->tgtdev_map[rbio->scrubp], | ||
2472 | pagenr, rbio->stripe_len); | ||
2473 | if (ret) | ||
2474 | goto cleanup; | ||
2475 | } | ||
2476 | |||
2477 | submit_write: | ||
2478 | nr_data = bio_list_size(&bio_list); | ||
2479 | if (!nr_data) { | ||
2480 | /* Every parity is right */ | ||
2481 | rbio_orig_end_io(rbio, 0, 0); | ||
2482 | return; | ||
2483 | } | ||
2484 | |||
2485 | atomic_set(&rbio->stripes_pending, nr_data); | ||
2486 | |||
2487 | while (1) { | ||
2488 | bio = bio_list_pop(&bio_list); | ||
2489 | if (!bio) | ||
2490 | break; | ||
2491 | |||
2492 | bio->bi_private = rbio; | ||
2493 | bio->bi_end_io = raid_write_parity_end_io; | ||
2494 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
2495 | submit_bio(WRITE, bio); | ||
2496 | } | ||
2497 | return; | ||
2498 | |||
2499 | cleanup: | ||
2500 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2501 | } | ||
2502 | |||
2503 | static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) | ||
2504 | { | ||
2505 | if (stripe >= 0 && stripe < rbio->nr_data) | ||
2506 | return 1; | ||
2507 | return 0; | ||
2508 | } | ||
2509 | |||
2510 | /* | ||
2511 | * While we're doing the parity check and repair, we could have errors | ||
2512 | * in reading pages off the disk. This checks for errors and if we're | ||
2513 | * not able to read the page it'll trigger parity reconstruction. The | ||
2514 | * parity scrub will be finished after we've reconstructed the failed | ||
2515 | * stripes | ||
2516 | */ | ||
2517 | static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) | ||
2518 | { | ||
2519 | if (atomic_read(&rbio->error) > rbio->bbio->max_errors) | ||
2520 | goto cleanup; | ||
2521 | |||
2522 | if (rbio->faila >= 0 || rbio->failb >= 0) { | ||
2523 | int dfail = 0, failp = -1; | ||
2524 | |||
2525 | if (is_data_stripe(rbio, rbio->faila)) | ||
2526 | dfail++; | ||
2527 | else if (is_parity_stripe(rbio->faila)) | ||
2528 | failp = rbio->faila; | ||
2529 | |||
2530 | if (is_data_stripe(rbio, rbio->failb)) | ||
2531 | dfail++; | ||
2532 | else if (is_parity_stripe(rbio->failb)) | ||
2533 | failp = rbio->failb; | ||
2534 | |||
2535 | /* | ||
2536 | * We cannot use the parity being scrubbed to repair the | ||
2537 | * data, so our repair capability is reduced by one. | ||
2538 | * (In the case of RAID5, we cannot repair anything.) | ||
2539 | */ | ||
2540 | if (dfail > rbio->bbio->max_errors - 1) | ||
2541 | goto cleanup; | ||
2542 | |||
2543 | /* | ||
2544 | * If all the data is good, only the parity is wrong, so | ||
2545 | * just repair the parity. | ||
2546 | */ | ||
2547 | if (dfail == 0) { | ||
2548 | finish_parity_scrub(rbio, 0); | ||
2549 | return; | ||
2550 | } | ||
2551 | |||
2552 | /* | ||
2553 | * At this point we have one corrupted data stripe and one | ||
2554 | * corrupted parity on RAID6. If the corrupted parity is the | ||
2555 | * one being scrubbed, we can luckily use the other parity to | ||
2556 | * repair the data; otherwise we cannot repair the data stripe. | ||
2557 | */ | ||
2558 | if (failp != rbio->scrubp) | ||
2559 | goto cleanup; | ||
2560 | |||
2561 | __raid_recover_end_io(rbio); | ||
2562 | } else { | ||
2563 | finish_parity_scrub(rbio, 1); | ||
2564 | } | ||
2565 | return; | ||
2566 | |||
2567 | cleanup: | ||
2568 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2569 | } | ||
2570 | |||
2571 | /* | ||
2572 | * end io for the read phase of the parity scrub cycle. All the bios here are physical | ||
2573 | * stripe bios we've read from the disk so we can recalculate the parity of the | ||
2574 | * stripe. | ||
2575 | * | ||
2576 | * This will usually kick off finish_parity_scrub once all the bios are read in, but it | ||
2577 | * may trigger parity reconstruction if we had any errors along the way | ||
2578 | */ | ||
2579 | static void raid56_parity_scrub_end_io(struct bio *bio, int err) | ||
2580 | { | ||
2581 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
2582 | |||
2583 | if (err) | ||
2584 | fail_bio_stripe(rbio, bio); | ||
2585 | else | ||
2586 | set_bio_pages_uptodate(bio); | ||
2587 | |||
2588 | bio_put(bio); | ||
2589 | |||
2590 | if (!atomic_dec_and_test(&rbio->stripes_pending)) | ||
2591 | return; | ||
2592 | |||
2593 | /* | ||
2594 | * this will normally call finish_parity_scrub to start our write | ||
2595 | * but if there are any failed stripes we'll reconstruct | ||
2596 | * from parity first | ||
2597 | */ | ||
2598 | validate_rbio_for_parity_scrub(rbio); | ||
2599 | } | ||
2600 | |||
2601 | static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) | ||
2602 | { | ||
2603 | int bios_to_read = 0; | ||
2604 | struct bio_list bio_list; | ||
2605 | int ret; | ||
2606 | int pagenr; | ||
2607 | int stripe; | ||
2608 | struct bio *bio; | ||
2609 | |||
2610 | ret = alloc_rbio_essential_pages(rbio); | ||
2611 | if (ret) | ||
2612 | goto cleanup; | ||
2613 | |||
2614 | bio_list_init(&bio_list); | ||
2615 | |||
2616 | atomic_set(&rbio->error, 0); | ||
2617 | /* | ||
2618 | * build a list of bios to read all the missing parts of this | ||
2619 | * stripe | ||
2620 | */ | ||
2621 | for (stripe = 0; stripe < rbio->real_stripes; stripe++) { | ||
2622 | for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { | ||
2623 | struct page *page; | ||
2624 | /* | ||
2625 | * we want to find all the pages missing from | ||
2626 | * the rbio and read them from the disk. If | ||
2627 | * page_in_rbio finds a page in the bio list | ||
2628 | * we don't need to read it off the stripe. | ||
2629 | */ | ||
2630 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
2631 | if (page) | ||
2632 | continue; | ||
2633 | |||
2634 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
2635 | /* | ||
2636 | * the bio cache may have handed us an uptodate | ||
2637 | * page. If so, be happy and use it | ||
2638 | */ | ||
2639 | if (PageUptodate(page)) | ||
2640 | continue; | ||
2641 | |||
2642 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
2643 | stripe, pagenr, rbio->stripe_len); | ||
2644 | if (ret) | ||
2645 | goto cleanup; | ||
2646 | } | ||
2647 | } | ||
2648 | |||
2649 | bios_to_read = bio_list_size(&bio_list); | ||
2650 | if (!bios_to_read) { | ||
2651 | /* | ||
2652 | * this can happen if others have merged with | ||
2653 | * us; it means there is nothing left to read. | ||
2654 | * But if there are missing devices it may not be | ||
2655 | * safe to do the full stripe write yet. | ||
2656 | */ | ||
2657 | goto finish; | ||
2658 | } | ||
2659 | |||
2660 | /* | ||
2661 | * the bbio may be freed once we submit the last bio. Make sure | ||
2662 | * not to touch it after that | ||
2663 | */ | ||
2664 | atomic_set(&rbio->stripes_pending, bios_to_read); | ||
2665 | while (1) { | ||
2666 | bio = bio_list_pop(&bio_list); | ||
2667 | if (!bio) | ||
2668 | break; | ||
2669 | |||
2670 | bio->bi_private = rbio; | ||
2671 | bio->bi_end_io = raid56_parity_scrub_end_io; | ||
2672 | |||
2673 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
2674 | BTRFS_WQ_ENDIO_RAID56); | ||
2675 | |||
2676 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
2677 | submit_bio(READ, bio); | ||
2678 | } | ||
2679 | /* the actual write will happen once the reads are done */ | ||
2680 | return; | ||
2681 | |||
2682 | cleanup: | ||
2683 | rbio_orig_end_io(rbio, -EIO, 0); | ||
2684 | return; | ||
2685 | |||
2686 | finish: | ||
2687 | validate_rbio_for_parity_scrub(rbio); | ||
2688 | } | ||
2689 | |||
2690 | static void scrub_parity_work(struct btrfs_work *work) | ||
2691 | { | ||
2692 | struct btrfs_raid_bio *rbio; | ||
2693 | |||
2694 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
2695 | raid56_parity_scrub_stripe(rbio); | ||
2696 | } | ||
2697 | |||
2698 | static void async_scrub_parity(struct btrfs_raid_bio *rbio) | ||
2699 | { | ||
2700 | btrfs_init_work(&rbio->work, btrfs_rmw_helper, | ||
2701 | scrub_parity_work, NULL, NULL); | ||
2702 | |||
2703 | btrfs_queue_work(rbio->fs_info->rmw_workers, | ||
2704 | &rbio->work); | ||
2705 | } | ||
2706 | |||
2707 | void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) | ||
2708 | { | ||
2709 | if (!lock_stripe_add(rbio)) | ||
2710 | async_scrub_parity(rbio); | ||
2711 | } | ||
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index ea5d73bfdfbe..31d4a157b5e3 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h | |||
@@ -39,13 +39,25 @@ static inline int nr_data_stripes(struct map_lookup *map) | |||
39 | #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ | 39 | #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ |
40 | ((x) == RAID6_Q_STRIPE)) | 40 | ((x) == RAID6_Q_STRIPE)) |
41 | 41 | ||
42 | struct btrfs_raid_bio; | ||
43 | struct btrfs_device; | ||
44 | |||
42 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | 45 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, |
43 | struct btrfs_bio *bbio, u64 *raid_map, | 46 | struct btrfs_bio *bbio, u64 *raid_map, |
44 | u64 stripe_len, int mirror_num); | 47 | u64 stripe_len, int mirror_num, int generic_io); |
45 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | 48 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, |
46 | struct btrfs_bio *bbio, u64 *raid_map, | 49 | struct btrfs_bio *bbio, u64 *raid_map, |
47 | u64 stripe_len); | 50 | u64 stripe_len); |
48 | 51 | ||
52 | struct btrfs_raid_bio * | ||
53 | raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, | ||
54 | struct btrfs_bio *bbio, u64 *raid_map, | ||
55 | u64 stripe_len, struct btrfs_device *scrub_dev, | ||
56 | unsigned long *dbitmap, int stripe_nsectors); | ||
57 | void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, | ||
58 | struct page *page, u64 logical); | ||
59 | void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); | ||
60 | |||
49 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); | 61 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); |
50 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); | 62 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); |
51 | #endif | 63 | #endif |
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index efa083113827..f2bb13a23f86 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -63,10 +63,18 @@ struct scrub_ctx; | |||
63 | */ | 63 | */ |
64 | #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ | 64 | #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ |
65 | 65 | ||
66 | struct scrub_recover { | ||
67 | atomic_t refs; | ||
68 | struct btrfs_bio *bbio; | ||
69 | u64 *raid_map; | ||
70 | u64 map_length; | ||
71 | }; | ||
72 | |||
66 | struct scrub_page { | 73 | struct scrub_page { |
67 | struct scrub_block *sblock; | 74 | struct scrub_block *sblock; |
68 | struct page *page; | 75 | struct page *page; |
69 | struct btrfs_device *dev; | 76 | struct btrfs_device *dev; |
77 | struct list_head list; | ||
70 | u64 flags; /* extent flags */ | 78 | u64 flags; /* extent flags */ |
71 | u64 generation; | 79 | u64 generation; |
72 | u64 logical; | 80 | u64 logical; |
@@ -79,6 +87,8 @@ struct scrub_page { | |||
79 | unsigned int io_error:1; | 87 | unsigned int io_error:1; |
80 | }; | 88 | }; |
81 | u8 csum[BTRFS_CSUM_SIZE]; | 89 | u8 csum[BTRFS_CSUM_SIZE]; |
90 | |||
91 | struct scrub_recover *recover; | ||
82 | }; | 92 | }; |
83 | 93 | ||
84 | struct scrub_bio { | 94 | struct scrub_bio { |
@@ -105,14 +115,52 @@ struct scrub_block { | |||
105 | atomic_t outstanding_pages; | 115 | atomic_t outstanding_pages; |
106 | atomic_t ref_count; /* free mem on transition to zero */ | 116 | atomic_t ref_count; /* free mem on transition to zero */ |
107 | struct scrub_ctx *sctx; | 117 | struct scrub_ctx *sctx; |
118 | struct scrub_parity *sparity; | ||
108 | struct { | 119 | struct { |
109 | unsigned int header_error:1; | 120 | unsigned int header_error:1; |
110 | unsigned int checksum_error:1; | 121 | unsigned int checksum_error:1; |
111 | unsigned int no_io_error_seen:1; | 122 | unsigned int no_io_error_seen:1; |
112 | unsigned int generation_error:1; /* also sets header_error */ | 123 | unsigned int generation_error:1; /* also sets header_error */ |
124 | |||
125 | /* The following is for the data used to check parity */ | ||
126 | /* It is for the data with checksum */ | ||
127 | unsigned int data_corrected:1; | ||
113 | }; | 128 | }; |
114 | }; | 129 | }; |
115 | 130 | ||
131 | /* Used for the chunks with parity stripes such as RAID5/6 */ | ||
132 | struct scrub_parity { | ||
133 | struct scrub_ctx *sctx; | ||
134 | |||
135 | struct btrfs_device *scrub_dev; | ||
136 | |||
137 | u64 logic_start; | ||
138 | |||
139 | u64 logic_end; | ||
140 | |||
141 | int nsectors; | ||
142 | |||
143 | int stripe_len; | ||
144 | |||
145 | atomic_t ref_count; | ||
146 | |||
147 | struct list_head spages; | ||
148 | |||
149 | /* Work of parity check and repair */ | ||
150 | struct btrfs_work work; | ||
151 | |||
152 | /* Mark the parity blocks which have data */ | ||
153 | unsigned long *dbitmap; | ||
154 | |||
155 | /* | ||
156 | * Mark the parity blocks which have data, but where errors happened | ||
157 | * when reading or checking that data | ||
158 | */ | ||
159 | unsigned long *ebitmap; | ||
160 | |||
161 | unsigned long bitmap[0]; | ||
162 | }; | ||
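As a side note, dbitmap and ebitmap are carved out of the trailing bitmap[0] array of this struct (the allocation itself appears later in scrub_raid56_parity()). A small user-space sketch of that layout, with illustrative sizes only:

#include <stdio.h>
#include <stdlib.h>

struct parity_demo {
	int nsectors;
	unsigned long *dbitmap;
	unsigned long *ebitmap;
	unsigned long bitmap[];		/* 2 * bitmap_len bytes follow */
};

int main(void)
{
	int bitmap_len = 8;		/* e.g. 16 sectors on a 64-bit build */
	struct parity_demo *sp = calloc(1, sizeof(*sp) + 2 * bitmap_len);

	if (!sp)
		return 1;
	sp->nsectors = 16;
	sp->dbitmap = sp->bitmap;
	sp->ebitmap = (unsigned long *)((char *)sp->bitmap + bitmap_len);
	printf("dbitmap at offset %zu, ebitmap at offset %zu\n",
	       (size_t)((char *)sp->dbitmap - (char *)sp),
	       (size_t)((char *)sp->ebitmap - (char *)sp));
	free(sp);
	return 0;
}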
163 | |||
116 | struct scrub_wr_ctx { | 164 | struct scrub_wr_ctx { |
117 | struct scrub_bio *wr_curr_bio; | 165 | struct scrub_bio *wr_curr_bio; |
118 | struct btrfs_device *tgtdev; | 166 | struct btrfs_device *tgtdev; |
@@ -196,7 +244,7 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, | |||
196 | static void scrub_recheck_block(struct btrfs_fs_info *fs_info, | 244 | static void scrub_recheck_block(struct btrfs_fs_info *fs_info, |
197 | struct scrub_block *sblock, int is_metadata, | 245 | struct scrub_block *sblock, int is_metadata, |
198 | int have_csum, u8 *csum, u64 generation, | 246 | int have_csum, u8 *csum, u64 generation, |
199 | u16 csum_size); | 247 | u16 csum_size, int retry_failed_mirror); |
200 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, | 248 | static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, |
201 | struct scrub_block *sblock, | 249 | struct scrub_block *sblock, |
202 | int is_metadata, int have_csum, | 250 | int is_metadata, int have_csum, |
@@ -218,6 +266,8 @@ static void scrub_block_get(struct scrub_block *sblock); | |||
218 | static void scrub_block_put(struct scrub_block *sblock); | 266 | static void scrub_block_put(struct scrub_block *sblock); |
219 | static void scrub_page_get(struct scrub_page *spage); | 267 | static void scrub_page_get(struct scrub_page *spage); |
220 | static void scrub_page_put(struct scrub_page *spage); | 268 | static void scrub_page_put(struct scrub_page *spage); |
269 | static void scrub_parity_get(struct scrub_parity *sparity); | ||
270 | static void scrub_parity_put(struct scrub_parity *sparity); | ||
221 | static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, | 271 | static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, |
222 | struct scrub_page *spage); | 272 | struct scrub_page *spage); |
223 | static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, | 273 | static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, |
@@ -790,6 +840,20 @@ out: | |||
790 | scrub_pending_trans_workers_dec(sctx); | 840 | scrub_pending_trans_workers_dec(sctx); |
791 | } | 841 | } |
792 | 842 | ||
843 | static inline void scrub_get_recover(struct scrub_recover *recover) | ||
844 | { | ||
845 | atomic_inc(&recover->refs); | ||
846 | } | ||
847 | |||
848 | static inline void scrub_put_recover(struct scrub_recover *recover) | ||
849 | { | ||
850 | if (atomic_dec_and_test(&recover->refs)) { | ||
851 | kfree(recover->bbio); | ||
852 | kfree(recover->raid_map); | ||
853 | kfree(recover); | ||
854 | } | ||
855 | } | ||
856 | |||
793 | /* | 857 | /* |
794 | * scrub_handle_errored_block gets called when either verification of the | 858 | * scrub_handle_errored_block gets called when either verification of the |
795 | * pages failed or the bio failed to read, e.g. with EIO. In the latter | 859 | * pages failed or the bio failed to read, e.g. with EIO. In the latter |
@@ -906,7 +970,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
906 | 970 | ||
907 | /* build and submit the bios for the failed mirror, check checksums */ | 971 | /* build and submit the bios for the failed mirror, check checksums */ |
908 | scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, | 972 | scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum, |
909 | csum, generation, sctx->csum_size); | 973 | csum, generation, sctx->csum_size, 1); |
910 | 974 | ||
911 | if (!sblock_bad->header_error && !sblock_bad->checksum_error && | 975 | if (!sblock_bad->header_error && !sblock_bad->checksum_error && |
912 | sblock_bad->no_io_error_seen) { | 976 | sblock_bad->no_io_error_seen) { |
@@ -920,6 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) | |||
920 | */ | 984 | */ |
921 | spin_lock(&sctx->stat_lock); | 985 | spin_lock(&sctx->stat_lock); |
922 | sctx->stat.unverified_errors++; | 986 | sctx->stat.unverified_errors++; |
987 | sblock_to_check->data_corrected = 1; | ||
923 | spin_unlock(&sctx->stat_lock); | 988 | spin_unlock(&sctx->stat_lock); |
924 | 989 | ||
925 | if (sctx->is_dev_replace) | 990 | if (sctx->is_dev_replace) |
@@ -1019,7 +1084,7 @@ nodatasum_case: | |||
1019 | /* build and submit the bios, check checksums */ | 1084 | /* build and submit the bios, check checksums */ |
1020 | scrub_recheck_block(fs_info, sblock_other, is_metadata, | 1085 | scrub_recheck_block(fs_info, sblock_other, is_metadata, |
1021 | have_csum, csum, generation, | 1086 | have_csum, csum, generation, |
1022 | sctx->csum_size); | 1087 | sctx->csum_size, 0); |
1023 | 1088 | ||
1024 | if (!sblock_other->header_error && | 1089 | if (!sblock_other->header_error && |
1025 | !sblock_other->checksum_error && | 1090 | !sblock_other->checksum_error && |
@@ -1169,7 +1234,7 @@ nodatasum_case: | |||
1169 | */ | 1234 | */ |
1170 | scrub_recheck_block(fs_info, sblock_bad, | 1235 | scrub_recheck_block(fs_info, sblock_bad, |
1171 | is_metadata, have_csum, csum, | 1236 | is_metadata, have_csum, csum, |
1172 | generation, sctx->csum_size); | 1237 | generation, sctx->csum_size, 1); |
1173 | if (!sblock_bad->header_error && | 1238 | if (!sblock_bad->header_error && |
1174 | !sblock_bad->checksum_error && | 1239 | !sblock_bad->checksum_error && |
1175 | sblock_bad->no_io_error_seen) | 1240 | sblock_bad->no_io_error_seen) |
@@ -1180,6 +1245,7 @@ nodatasum_case: | |||
1180 | corrected_error: | 1245 | corrected_error: |
1181 | spin_lock(&sctx->stat_lock); | 1246 | spin_lock(&sctx->stat_lock); |
1182 | sctx->stat.corrected_errors++; | 1247 | sctx->stat.corrected_errors++; |
1248 | sblock_to_check->data_corrected = 1; | ||
1183 | spin_unlock(&sctx->stat_lock); | 1249 | spin_unlock(&sctx->stat_lock); |
1184 | printk_ratelimited_in_rcu(KERN_ERR | 1250 | printk_ratelimited_in_rcu(KERN_ERR |
1185 | "BTRFS: fixed up error at logical %llu on dev %s\n", | 1251 | "BTRFS: fixed up error at logical %llu on dev %s\n", |
@@ -1201,11 +1267,18 @@ out: | |||
1201 | mirror_index++) { | 1267 | mirror_index++) { |
1202 | struct scrub_block *sblock = sblocks_for_recheck + | 1268 | struct scrub_block *sblock = sblocks_for_recheck + |
1203 | mirror_index; | 1269 | mirror_index; |
1270 | struct scrub_recover *recover; | ||
1204 | int page_index; | 1271 | int page_index; |
1205 | 1272 | ||
1206 | for (page_index = 0; page_index < sblock->page_count; | 1273 | for (page_index = 0; page_index < sblock->page_count; |
1207 | page_index++) { | 1274 | page_index++) { |
1208 | sblock->pagev[page_index]->sblock = NULL; | 1275 | sblock->pagev[page_index]->sblock = NULL; |
1276 | recover = sblock->pagev[page_index]->recover; | ||
1277 | if (recover) { | ||
1278 | scrub_put_recover(recover); | ||
1279 | sblock->pagev[page_index]->recover = | ||
1280 | NULL; | ||
1281 | } | ||
1209 | scrub_page_put(sblock->pagev[page_index]); | 1282 | scrub_page_put(sblock->pagev[page_index]); |
1210 | } | 1283 | } |
1211 | } | 1284 | } |
@@ -1215,14 +1288,63 @@ out: | |||
1215 | return 0; | 1288 | return 0; |
1216 | } | 1289 | } |
1217 | 1290 | ||
1291 | static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map) | ||
1292 | { | ||
1293 | if (raid_map) { | ||
1294 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | ||
1295 | return 3; | ||
1296 | else | ||
1297 | return 2; | ||
1298 | } else { | ||
1299 | return (int)bbio->num_stripes; | ||
1300 | } | ||
1301 | } | ||
1302 | |||
1303 | static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, | ||
1304 | u64 mapped_length, | ||
1305 | int nstripes, int mirror, | ||
1306 | int *stripe_index, | ||
1307 | u64 *stripe_offset) | ||
1308 | { | ||
1309 | int i; | ||
1310 | |||
1311 | if (raid_map) { | ||
1312 | /* RAID5/6 */ | ||
1313 | for (i = 0; i < nstripes; i++) { | ||
1314 | if (raid_map[i] == RAID6_Q_STRIPE || | ||
1315 | raid_map[i] == RAID5_P_STRIPE) | ||
1316 | continue; | ||
1317 | |||
1318 | if (logical >= raid_map[i] && | ||
1319 | logical < raid_map[i] + mapped_length) | ||
1320 | break; | ||
1321 | } | ||
1322 | |||
1323 | *stripe_index = i; | ||
1324 | *stripe_offset = logical - raid_map[i]; | ||
1325 | } else { | ||
1326 | /* The other RAID type */ | ||
1327 | *stripe_index = mirror; | ||
1328 | *stripe_offset = 0; | ||
1329 | } | ||
1330 | } | ||
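To make the RAID5/6 branch concrete, here is a hedged user-space sketch with made-up numbers: a raid_map for two 64K data stripes starting at logical 1M plus the P and Q entries, and a search window of one stripe length per data stripe (an illustrative assumption, not taken from this patch):

#include <stdio.h>
#include <stdint.h>

#define RAID5_P_STRIPE ((uint64_t)-2)
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
	uint64_t raid_map[] = { 1048576, 1114112, RAID5_P_STRIPE, RAID6_Q_STRIPE };
	int nstripes = 4;
	uint64_t stripe_len = 65536;
	uint64_t logical = 1118208;	/* 1M + 68K, i.e. 4K into the 2nd data stripe */
	int i;

	/* skip the P/Q entries, pick the data stripe whose range covers 'logical' */
	for (i = 0; i < nstripes; i++) {
		if (raid_map[i] == RAID5_P_STRIPE || raid_map[i] == RAID6_Q_STRIPE)
			continue;
		if (logical >= raid_map[i] && logical < raid_map[i] + stripe_len)
			break;
	}
	printf("stripe_index=%d stripe_offset=%llu\n",
	       i, (unsigned long long)(logical - raid_map[i]));	/* 1 and 4096 */
	return 0;
}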
1331 | |||
1218 | static int scrub_setup_recheck_block(struct scrub_ctx *sctx, | 1332 | static int scrub_setup_recheck_block(struct scrub_ctx *sctx, |
1219 | struct btrfs_fs_info *fs_info, | 1333 | struct btrfs_fs_info *fs_info, |
1220 | struct scrub_block *original_sblock, | 1334 | struct scrub_block *original_sblock, |
1221 | u64 length, u64 logical, | 1335 | u64 length, u64 logical, |
1222 | struct scrub_block *sblocks_for_recheck) | 1336 | struct scrub_block *sblocks_for_recheck) |
1223 | { | 1337 | { |
1338 | struct scrub_recover *recover; | ||
1339 | struct btrfs_bio *bbio; | ||
1340 | u64 *raid_map; | ||
1341 | u64 sublen; | ||
1342 | u64 mapped_length; | ||
1343 | u64 stripe_offset; | ||
1344 | int stripe_index; | ||
1224 | int page_index; | 1345 | int page_index; |
1225 | int mirror_index; | 1346 | int mirror_index; |
1347 | int nmirrors; | ||
1226 | int ret; | 1348 | int ret; |
1227 | 1349 | ||
1228 | /* | 1350 | /* |
@@ -1233,23 +1355,39 @@ static int scrub_setup_recheck_block(struct scrub_ctx *sctx, | |||
1233 | 1355 | ||
1234 | page_index = 0; | 1356 | page_index = 0; |
1235 | while (length > 0) { | 1357 | while (length > 0) { |
1236 | u64 sublen = min_t(u64, length, PAGE_SIZE); | 1358 | sublen = min_t(u64, length, PAGE_SIZE); |
1237 | u64 mapped_length = sublen; | 1359 | mapped_length = sublen; |
1238 | struct btrfs_bio *bbio = NULL; | 1360 | bbio = NULL; |
1361 | raid_map = NULL; | ||
1239 | 1362 | ||
1240 | /* | 1363 | /* |
1241 | * with a length of PAGE_SIZE, each returned stripe | 1364 | * with a length of PAGE_SIZE, each returned stripe |
1242 | * represents one mirror | 1365 | * represents one mirror |
1243 | */ | 1366 | */ |
1244 | ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, | 1367 | ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, |
1245 | &mapped_length, &bbio, 0); | 1368 | &mapped_length, &bbio, 0, &raid_map); |
1246 | if (ret || !bbio || mapped_length < sublen) { | 1369 | if (ret || !bbio || mapped_length < sublen) { |
1247 | kfree(bbio); | 1370 | kfree(bbio); |
1371 | kfree(raid_map); | ||
1248 | return -EIO; | 1372 | return -EIO; |
1249 | } | 1373 | } |
1250 | 1374 | ||
1375 | recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); | ||
1376 | if (!recover) { | ||
1377 | kfree(bbio); | ||
1378 | kfree(raid_map); | ||
1379 | return -ENOMEM; | ||
1380 | } | ||
1381 | |||
1382 | atomic_set(&recover->refs, 1); | ||
1383 | recover->bbio = bbio; | ||
1384 | recover->raid_map = raid_map; | ||
1385 | recover->map_length = mapped_length; | ||
1386 | |||
1251 | BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); | 1387 | BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); |
1252 | for (mirror_index = 0; mirror_index < (int)bbio->num_stripes; | 1388 | |
1389 | nmirrors = scrub_nr_raid_mirrors(bbio, raid_map); | ||
1390 | for (mirror_index = 0; mirror_index < nmirrors; | ||
1253 | mirror_index++) { | 1391 | mirror_index++) { |
1254 | struct scrub_block *sblock; | 1392 | struct scrub_block *sblock; |
1255 | struct scrub_page *page; | 1393 | struct scrub_page *page; |
@@ -1265,26 +1403,38 @@ leave_nomem: | |||
1265 | spin_lock(&sctx->stat_lock); | 1403 | spin_lock(&sctx->stat_lock); |
1266 | sctx->stat.malloc_errors++; | 1404 | sctx->stat.malloc_errors++; |
1267 | spin_unlock(&sctx->stat_lock); | 1405 | spin_unlock(&sctx->stat_lock); |
1268 | kfree(bbio); | 1406 | scrub_put_recover(recover); |
1269 | return -ENOMEM; | 1407 | return -ENOMEM; |
1270 | } | 1408 | } |
1271 | scrub_page_get(page); | 1409 | scrub_page_get(page); |
1272 | sblock->pagev[page_index] = page; | 1410 | sblock->pagev[page_index] = page; |
1273 | page->logical = logical; | 1411 | page->logical = logical; |
1274 | page->physical = bbio->stripes[mirror_index].physical; | 1412 | |
1413 | scrub_stripe_index_and_offset(logical, raid_map, | ||
1414 | mapped_length, | ||
1415 | bbio->num_stripes, | ||
1416 | mirror_index, | ||
1417 | &stripe_index, | ||
1418 | &stripe_offset); | ||
1419 | page->physical = bbio->stripes[stripe_index].physical + | ||
1420 | stripe_offset; | ||
1421 | page->dev = bbio->stripes[stripe_index].dev; | ||
1422 | |||
1275 | BUG_ON(page_index >= original_sblock->page_count); | 1423 | BUG_ON(page_index >= original_sblock->page_count); |
1276 | page->physical_for_dev_replace = | 1424 | page->physical_for_dev_replace = |
1277 | original_sblock->pagev[page_index]-> | 1425 | original_sblock->pagev[page_index]-> |
1278 | physical_for_dev_replace; | 1426 | physical_for_dev_replace; |
1279 | /* for missing devices, dev->bdev is NULL */ | 1427 | /* for missing devices, dev->bdev is NULL */ |
1280 | page->dev = bbio->stripes[mirror_index].dev; | ||
1281 | page->mirror_num = mirror_index + 1; | 1428 | page->mirror_num = mirror_index + 1; |
1282 | sblock->page_count++; | 1429 | sblock->page_count++; |
1283 | page->page = alloc_page(GFP_NOFS); | 1430 | page->page = alloc_page(GFP_NOFS); |
1284 | if (!page->page) | 1431 | if (!page->page) |
1285 | goto leave_nomem; | 1432 | goto leave_nomem; |
1433 | |||
1434 | scrub_get_recover(recover); | ||
1435 | page->recover = recover; | ||
1286 | } | 1436 | } |
1287 | kfree(bbio); | 1437 | scrub_put_recover(recover); |
1288 | length -= sublen; | 1438 | length -= sublen; |
1289 | logical += sublen; | 1439 | logical += sublen; |
1290 | page_index++; | 1440 | page_index++; |
@@ -1293,6 +1443,51 @@ leave_nomem: | |||
1293 | return 0; | 1443 | return 0; |
1294 | } | 1444 | } |
1295 | 1445 | ||
1446 | struct scrub_bio_ret { | ||
1447 | struct completion event; | ||
1448 | int error; | ||
1449 | }; | ||
1450 | |||
1451 | static void scrub_bio_wait_endio(struct bio *bio, int error) | ||
1452 | { | ||
1453 | struct scrub_bio_ret *ret = bio->bi_private; | ||
1454 | |||
1455 | ret->error = error; | ||
1456 | complete(&ret->event); | ||
1457 | } | ||
1458 | |||
1459 | static inline int scrub_is_page_on_raid56(struct scrub_page *page) | ||
1460 | { | ||
1461 | return page->recover && page->recover->raid_map; | ||
1462 | } | ||
1463 | |||
1464 | static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, | ||
1465 | struct bio *bio, | ||
1466 | struct scrub_page *page) | ||
1467 | { | ||
1468 | struct scrub_bio_ret done; | ||
1469 | int ret; | ||
1470 | |||
1471 | init_completion(&done.event); | ||
1472 | done.error = 0; | ||
1473 | bio->bi_iter.bi_sector = page->logical >> 9; | ||
1474 | bio->bi_private = &done; | ||
1475 | bio->bi_end_io = scrub_bio_wait_endio; | ||
1476 | |||
1477 | ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, | ||
1478 | page->recover->raid_map, | ||
1479 | page->recover->map_length, | ||
1480 | page->mirror_num, 0); | ||
1481 | if (ret) | ||
1482 | return ret; | ||
1483 | |||
1484 | wait_for_completion(&done.event); | ||
1485 | if (done.error) | ||
1486 | return -EIO; | ||
1487 | |||
1488 | return 0; | ||
1489 | } | ||
1490 | |||
1296 | /* | 1491 | /* |
1297 | * this function will check the on disk data for checksum errors, header | 1492 | * this function will check the on disk data for checksum errors, header |
1298 | * errors and read I/O errors. If any I/O errors happen, the exact pages | 1493 | * errors and read I/O errors. If any I/O errors happen, the exact pages |
@@ -1303,7 +1498,7 @@ leave_nomem: | |||
1303 | static void scrub_recheck_block(struct btrfs_fs_info *fs_info, | 1498 | static void scrub_recheck_block(struct btrfs_fs_info *fs_info, |
1304 | struct scrub_block *sblock, int is_metadata, | 1499 | struct scrub_block *sblock, int is_metadata, |
1305 | int have_csum, u8 *csum, u64 generation, | 1500 | int have_csum, u8 *csum, u64 generation, |
1306 | u16 csum_size) | 1501 | u16 csum_size, int retry_failed_mirror) |
1307 | { | 1502 | { |
1308 | int page_num; | 1503 | int page_num; |
1309 | 1504 | ||
@@ -1329,11 +1524,17 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, | |||
1329 | continue; | 1524 | continue; |
1330 | } | 1525 | } |
1331 | bio->bi_bdev = page->dev->bdev; | 1526 | bio->bi_bdev = page->dev->bdev; |
1332 | bio->bi_iter.bi_sector = page->physical >> 9; | ||
1333 | 1527 | ||
1334 | bio_add_page(bio, page->page, PAGE_SIZE, 0); | 1528 | bio_add_page(bio, page->page, PAGE_SIZE, 0); |
1335 | if (btrfsic_submit_bio_wait(READ, bio)) | 1529 | if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { |
1336 | sblock->no_io_error_seen = 0; | 1530 | if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) |
1531 | sblock->no_io_error_seen = 0; | ||
1532 | } else { | ||
1533 | bio->bi_iter.bi_sector = page->physical >> 9; | ||
1534 | |||
1535 | if (btrfsic_submit_bio_wait(READ, bio)) | ||
1536 | sblock->no_io_error_seen = 0; | ||
1537 | } | ||
1337 | 1538 | ||
1338 | bio_put(bio); | 1539 | bio_put(bio); |
1339 | } | 1540 | } |
@@ -1486,6 +1687,13 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) | |||
1486 | { | 1687 | { |
1487 | int page_num; | 1688 | int page_num; |
1488 | 1689 | ||
1690 | /* | ||
1691 | * This block is used to check the parity on the source device, so the | ||
1692 | * data need not be written to the destination device. | ||
1693 | */ | ||
1694 | if (sblock->sparity) | ||
1695 | return; | ||
1696 | |||
1489 | for (page_num = 0; page_num < sblock->page_count; page_num++) { | 1697 | for (page_num = 0; page_num < sblock->page_count; page_num++) { |
1490 | int ret; | 1698 | int ret; |
1491 | 1699 | ||
@@ -1867,6 +2075,9 @@ static void scrub_block_put(struct scrub_block *sblock) | |||
1867 | if (atomic_dec_and_test(&sblock->ref_count)) { | 2075 | if (atomic_dec_and_test(&sblock->ref_count)) { |
1868 | int i; | 2076 | int i; |
1869 | 2077 | ||
2078 | if (sblock->sparity) | ||
2079 | scrub_parity_put(sblock->sparity); | ||
2080 | |||
1870 | for (i = 0; i < sblock->page_count; i++) | 2081 | for (i = 0; i < sblock->page_count; i++) |
1871 | scrub_page_put(sblock->pagev[i]); | 2082 | scrub_page_put(sblock->pagev[i]); |
1872 | kfree(sblock); | 2083 | kfree(sblock); |
@@ -2124,9 +2335,51 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) | |||
2124 | scrub_pending_bio_dec(sctx); | 2335 | scrub_pending_bio_dec(sctx); |
2125 | } | 2336 | } |
2126 | 2337 | ||
2338 | static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, | ||
2339 | unsigned long *bitmap, | ||
2340 | u64 start, u64 len) | ||
2341 | { | ||
2342 | int offset; | ||
2343 | int nsectors; | ||
2344 | int sectorsize = sparity->sctx->dev_root->sectorsize; | ||
2345 | |||
2346 | if (len >= sparity->stripe_len) { | ||
2347 | bitmap_set(bitmap, 0, sparity->nsectors); | ||
2348 | return; | ||
2349 | } | ||
2350 | |||
2351 | start -= sparity->logic_start; | ||
2352 | offset = (int)do_div(start, sparity->stripe_len); | ||
2353 | offset /= sectorsize; | ||
2354 | nsectors = (int)len / sectorsize; | ||
2355 | |||
2356 | if (offset + nsectors <= sparity->nsectors) { | ||
2357 | bitmap_set(bitmap, offset, nsectors); | ||
2358 | return; | ||
2359 | } | ||
2360 | |||
2361 | bitmap_set(bitmap, offset, sparity->nsectors - offset); | ||
2362 | bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); | ||
2363 | } | ||
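A worked example of the wrap-around case above, as a small user-space sketch with illustrative values (64K stripe_len, 4K sectors, so 16 bitmap bits); the full-stripe shortcut at the top of the function is omitted here:

#include <stdio.h>

#define NSECTORS 16	/* 64K stripe_len / 4K sectorsize */

int main(void)
{
	unsigned long long logic_start = 1048576;	/* 1M */
	unsigned long long stripe_len = 65536;
	unsigned int sectorsize = 4096;
	unsigned long long start = 1048576 + 61440;	/* last sector of the stripe */
	unsigned long long len = 12288;			/* 3 sectors, 2 of them wrap */
	int offset, nsectors, i;
	char bitmap[NSECTORS] = { 0 };

	offset = (int)(((start - logic_start) % stripe_len) / sectorsize);
	nsectors = (int)(len / sectorsize);
	if (offset + nsectors <= NSECTORS) {
		for (i = 0; i < nsectors; i++)
			bitmap[offset + i] = 1;
	} else {
		for (i = offset; i < NSECTORS; i++)
			bitmap[i] = 1;		/* tail of the stripe */
		for (i = 0; i < nsectors - (NSECTORS - offset); i++)
			bitmap[i] = 1;		/* wrapped to the front */
	}
	for (i = 0; i < NSECTORS; i++)
		putchar(bitmap[i] ? '1' : '0');
	putchar('\n');	/* prints 1100000000000001 */
	return 0;
}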
2364 | |||
2365 | static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, | ||
2366 | u64 start, u64 len) | ||
2367 | { | ||
2368 | __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); | ||
2369 | } | ||
2370 | |||
2371 | static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, | ||
2372 | u64 start, u64 len) | ||
2373 | { | ||
2374 | __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); | ||
2375 | } | ||
2376 | |||
2127 | static void scrub_block_complete(struct scrub_block *sblock) | 2377 | static void scrub_block_complete(struct scrub_block *sblock) |
2128 | { | 2378 | { |
2379 | int corrupted = 0; | ||
2380 | |||
2129 | if (!sblock->no_io_error_seen) { | 2381 | if (!sblock->no_io_error_seen) { |
2382 | corrupted = 1; | ||
2130 | scrub_handle_errored_block(sblock); | 2383 | scrub_handle_errored_block(sblock); |
2131 | } else { | 2384 | } else { |
2132 | /* | 2385 | /* |
@@ -2134,9 +2387,19 @@ static void scrub_block_complete(struct scrub_block *sblock) | |||
2134 | * dev replace case, otherwise write here in dev replace | 2387 | * dev replace case, otherwise write here in dev replace |
2135 | * case. | 2388 | * case. |
2136 | */ | 2389 | */ |
2137 | if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace) | 2390 | corrupted = scrub_checksum(sblock); |
2391 | if (!corrupted && sblock->sctx->is_dev_replace) | ||
2138 | scrub_write_block_to_dev_replace(sblock); | 2392 | scrub_write_block_to_dev_replace(sblock); |
2139 | } | 2393 | } |
2394 | |||
2395 | if (sblock->sparity && corrupted && !sblock->data_corrected) { | ||
2396 | u64 start = sblock->pagev[0]->logical; | ||
2397 | u64 end = sblock->pagev[sblock->page_count - 1]->logical + | ||
2398 | PAGE_SIZE; | ||
2399 | |||
2400 | scrub_parity_mark_sectors_error(sblock->sparity, | ||
2401 | start, end - start); | ||
2402 | } | ||
2140 | } | 2403 | } |
2141 | 2404 | ||
2142 | static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, | 2405 | static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len, |
@@ -2228,6 +2491,132 @@ behind_scrub_pages: | |||
2228 | return 0; | 2491 | return 0; |
2229 | } | 2492 | } |
2230 | 2493 | ||
2494 | static int scrub_pages_for_parity(struct scrub_parity *sparity, | ||
2495 | u64 logical, u64 len, | ||
2496 | u64 physical, struct btrfs_device *dev, | ||
2497 | u64 flags, u64 gen, int mirror_num, u8 *csum) | ||
2498 | { | ||
2499 | struct scrub_ctx *sctx = sparity->sctx; | ||
2500 | struct scrub_block *sblock; | ||
2501 | int index; | ||
2502 | |||
2503 | sblock = kzalloc(sizeof(*sblock), GFP_NOFS); | ||
2504 | if (!sblock) { | ||
2505 | spin_lock(&sctx->stat_lock); | ||
2506 | sctx->stat.malloc_errors++; | ||
2507 | spin_unlock(&sctx->stat_lock); | ||
2508 | return -ENOMEM; | ||
2509 | } | ||
2510 | |||
2511 | /* one ref inside this function, plus one for each page added to | ||
2512 | * a bio later on */ | ||
2513 | atomic_set(&sblock->ref_count, 1); | ||
2514 | sblock->sctx = sctx; | ||
2515 | sblock->no_io_error_seen = 1; | ||
2516 | sblock->sparity = sparity; | ||
2517 | scrub_parity_get(sparity); | ||
2518 | |||
2519 | for (index = 0; len > 0; index++) { | ||
2520 | struct scrub_page *spage; | ||
2521 | u64 l = min_t(u64, len, PAGE_SIZE); | ||
2522 | |||
2523 | spage = kzalloc(sizeof(*spage), GFP_NOFS); | ||
2524 | if (!spage) { | ||
2525 | leave_nomem: | ||
2526 | spin_lock(&sctx->stat_lock); | ||
2527 | sctx->stat.malloc_errors++; | ||
2528 | spin_unlock(&sctx->stat_lock); | ||
2529 | scrub_block_put(sblock); | ||
2530 | return -ENOMEM; | ||
2531 | } | ||
2532 | BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK); | ||
2533 | /* For scrub block */ | ||
2534 | scrub_page_get(spage); | ||
2535 | sblock->pagev[index] = spage; | ||
2536 | /* For scrub parity */ | ||
2537 | scrub_page_get(spage); | ||
2538 | list_add_tail(&spage->list, &sparity->spages); | ||
2539 | spage->sblock = sblock; | ||
2540 | spage->dev = dev; | ||
2541 | spage->flags = flags; | ||
2542 | spage->generation = gen; | ||
2543 | spage->logical = logical; | ||
2544 | spage->physical = physical; | ||
2545 | spage->mirror_num = mirror_num; | ||
2546 | if (csum) { | ||
2547 | spage->have_csum = 1; | ||
2548 | memcpy(spage->csum, csum, sctx->csum_size); | ||
2549 | } else { | ||
2550 | spage->have_csum = 0; | ||
2551 | } | ||
2552 | sblock->page_count++; | ||
2553 | spage->page = alloc_page(GFP_NOFS); | ||
2554 | if (!spage->page) | ||
2555 | goto leave_nomem; | ||
2556 | len -= l; | ||
2557 | logical += l; | ||
2558 | physical += l; | ||
2559 | } | ||
2560 | |||
2561 | WARN_ON(sblock->page_count == 0); | ||
2562 | for (index = 0; index < sblock->page_count; index++) { | ||
2563 | struct scrub_page *spage = sblock->pagev[index]; | ||
2564 | int ret; | ||
2565 | |||
2566 | ret = scrub_add_page_to_rd_bio(sctx, spage); | ||
2567 | if (ret) { | ||
2568 | scrub_block_put(sblock); | ||
2569 | return ret; | ||
2570 | } | ||
2571 | } | ||
2572 | |||
2573 | /* last one frees, either here or in bio completion for last page */ | ||
2574 | scrub_block_put(sblock); | ||
2575 | return 0; | ||
2576 | } | ||
2577 | |||
2578 | static int scrub_extent_for_parity(struct scrub_parity *sparity, | ||
2579 | u64 logical, u64 len, | ||
2580 | u64 physical, struct btrfs_device *dev, | ||
2581 | u64 flags, u64 gen, int mirror_num) | ||
2582 | { | ||
2583 | struct scrub_ctx *sctx = sparity->sctx; | ||
2584 | int ret; | ||
2585 | u8 csum[BTRFS_CSUM_SIZE]; | ||
2586 | u32 blocksize; | ||
2587 | |||
2588 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | ||
2589 | blocksize = sctx->sectorsize; | ||
2590 | } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
2591 | blocksize = sctx->nodesize; | ||
2592 | } else { | ||
2593 | blocksize = sctx->sectorsize; | ||
2594 | WARN_ON(1); | ||
2595 | } | ||
2596 | |||
2597 | while (len) { | ||
2598 | u64 l = min_t(u64, len, blocksize); | ||
2599 | int have_csum = 0; | ||
2600 | |||
2601 | if (flags & BTRFS_EXTENT_FLAG_DATA) { | ||
2602 | /* push csums to sbio */ | ||
2603 | have_csum = scrub_find_csum(sctx, logical, l, csum); | ||
2604 | if (have_csum == 0) | ||
2605 | goto skip; | ||
2606 | } | ||
2607 | ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, | ||
2608 | flags, gen, mirror_num, | ||
2609 | have_csum ? csum : NULL); | ||
2610 | skip: | ||
2611 | if (ret) | ||
2612 | return ret; | ||
2613 | len -= l; | ||
2614 | logical += l; | ||
2615 | physical += l; | ||
2616 | } | ||
2617 | return 0; | ||
2618 | } | ||
2619 | |||
2231 | /* | 2620 | /* |
2232 | * Given a physical address, this will calculate its | 2621 | * Given a physical address, this will calculate its |
2233 | * logical offset. If this is a parity stripe, it will return | 2622 | * logical offset. If this is a parity stripe, it will return |
@@ -2236,7 +2625,8 @@ behind_scrub_pages: | |||
2236 | * return 0 if it is a data stripe, 1 means parity stripe. | 2625 | * return 0 if it is a data stripe, 1 means parity stripe. |
2237 | */ | 2626 | */ |
2238 | static int get_raid56_logic_offset(u64 physical, int num, | 2627 | static int get_raid56_logic_offset(u64 physical, int num, |
2239 | struct map_lookup *map, u64 *offset) | 2628 | struct map_lookup *map, u64 *offset, |
2629 | u64 *stripe_start) | ||
2240 | { | 2630 | { |
2241 | int i; | 2631 | int i; |
2242 | int j = 0; | 2632 | int j = 0; |
@@ -2247,6 +2637,9 @@ static int get_raid56_logic_offset(u64 physical, int num, | |||
2247 | 2637 | ||
2248 | last_offset = (physical - map->stripes[num].physical) * | 2638 | last_offset = (physical - map->stripes[num].physical) * |
2249 | nr_data_stripes(map); | 2639 | nr_data_stripes(map); |
2640 | if (stripe_start) | ||
2641 | *stripe_start = last_offset; | ||
2642 | |||
2250 | *offset = last_offset; | 2643 | *offset = last_offset; |
2251 | for (i = 0; i < nr_data_stripes(map); i++) { | 2644 | for (i = 0; i < nr_data_stripes(map); i++) { |
2252 | *offset = last_offset + i * map->stripe_len; | 2645 | *offset = last_offset + i * map->stripe_len; |
@@ -2269,13 +2662,330 @@ static int get_raid56_logic_offset(u64 physical, int num, | |||
2269 | return 1; | 2662 | return 1; |
2270 | } | 2663 | } |
2271 | 2664 | ||
2665 | static void scrub_free_parity(struct scrub_parity *sparity) | ||
2666 | { | ||
2667 | struct scrub_ctx *sctx = sparity->sctx; | ||
2668 | struct scrub_page *curr, *next; | ||
2669 | int nbits; | ||
2670 | |||
2671 | nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); | ||
2672 | if (nbits) { | ||
2673 | spin_lock(&sctx->stat_lock); | ||
2674 | sctx->stat.read_errors += nbits; | ||
2675 | sctx->stat.uncorrectable_errors += nbits; | ||
2676 | spin_unlock(&sctx->stat_lock); | ||
2677 | } | ||
2678 | |||
2679 | list_for_each_entry_safe(curr, next, &sparity->spages, list) { | ||
2680 | list_del_init(&curr->list); | ||
2681 | scrub_page_put(curr); | ||
2682 | } | ||
2683 | |||
2684 | kfree(sparity); | ||
2685 | } | ||
2686 | |||
2687 | static void scrub_parity_bio_endio(struct bio *bio, int error) | ||
2688 | { | ||
2689 | struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; | ||
2690 | struct scrub_ctx *sctx = sparity->sctx; | ||
2691 | |||
2692 | if (error) | ||
2693 | bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, | ||
2694 | sparity->nsectors); | ||
2695 | |||
2696 | scrub_free_parity(sparity); | ||
2697 | scrub_pending_bio_dec(sctx); | ||
2698 | bio_put(bio); | ||
2699 | } | ||
2700 | |||
2701 | static void scrub_parity_check_and_repair(struct scrub_parity *sparity) | ||
2702 | { | ||
2703 | struct scrub_ctx *sctx = sparity->sctx; | ||
2704 | struct bio *bio; | ||
2705 | struct btrfs_raid_bio *rbio; | ||
2706 | struct scrub_page *spage; | ||
2707 | struct btrfs_bio *bbio = NULL; | ||
2708 | u64 *raid_map = NULL; | ||
2709 | u64 length; | ||
2710 | int ret; | ||
2711 | |||
2712 | if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, | ||
2713 | sparity->nsectors)) | ||
2714 | goto out; | ||
2715 | |||
2716 | length = sparity->logic_end - sparity->logic_start + 1; | ||
2717 | ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, | ||
2718 | sparity->logic_start, | ||
2719 | &length, &bbio, 0, &raid_map); | ||
2720 | if (ret || !bbio || !raid_map) | ||
2721 | goto bbio_out; | ||
2722 | |||
2723 | bio = btrfs_io_bio_alloc(GFP_NOFS, 0); | ||
2724 | if (!bio) | ||
2725 | goto bbio_out; | ||
2726 | |||
2727 | bio->bi_iter.bi_sector = sparity->logic_start >> 9; | ||
2728 | bio->bi_private = sparity; | ||
2729 | bio->bi_end_io = scrub_parity_bio_endio; | ||
2730 | |||
2731 | rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, | ||
2732 | raid_map, length, | ||
2733 | sparity->scrub_dev, | ||
2734 | sparity->dbitmap, | ||
2735 | sparity->nsectors); | ||
2736 | if (!rbio) | ||
2737 | goto rbio_out; | ||
2738 | |||
2739 | list_for_each_entry(spage, &sparity->spages, list) | ||
2740 | raid56_parity_add_scrub_pages(rbio, spage->page, | ||
2741 | spage->logical); | ||
2742 | |||
2743 | scrub_pending_bio_inc(sctx); | ||
2744 | raid56_parity_submit_scrub_rbio(rbio); | ||
2745 | return; | ||
2746 | |||
2747 | rbio_out: | ||
2748 | bio_put(bio); | ||
2749 | bbio_out: | ||
2750 | kfree(bbio); | ||
2751 | kfree(raid_map); | ||
2752 | bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, | ||
2753 | sparity->nsectors); | ||
2754 | spin_lock(&sctx->stat_lock); | ||
2755 | sctx->stat.malloc_errors++; | ||
2756 | spin_unlock(&sctx->stat_lock); | ||
2757 | out: | ||
2758 | scrub_free_parity(sparity); | ||
2759 | } | ||
2760 | |||
2761 | static inline int scrub_calc_parity_bitmap_len(int nsectors) | ||
2762 | { | ||
2763 | return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); | ||
2764 | } | ||
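Plugging in typical values (an assumption, not spelled out in the patch): with a 64K stripe_len and 4K sectors there are 16 sectors per stripe, so each bitmap needs a single unsigned long, i.e. 8 bytes on a 64-bit build. A tiny sketch of the sizing:

#include <stdio.h>

#define BITS_PER_LONG 64
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static int parity_bitmap_len(int nsectors)
{
	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
}

int main(void)
{
	printf("%d\n", parity_bitmap_len(16));	/* 8 bytes per bitmap */
	return 0;
}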
2765 | |||
2766 | static void scrub_parity_get(struct scrub_parity *sparity) | ||
2767 | { | ||
2768 | atomic_inc(&sparity->ref_count); | ||
2769 | } | ||
2770 | |||
2771 | static void scrub_parity_put(struct scrub_parity *sparity) | ||
2772 | { | ||
2773 | if (!atomic_dec_and_test(&sparity->ref_count)) | ||
2774 | return; | ||
2775 | |||
2776 | scrub_parity_check_and_repair(sparity); | ||
2777 | } | ||
2778 | |||
2779 | static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, | ||
2780 | struct map_lookup *map, | ||
2781 | struct btrfs_device *sdev, | ||
2782 | struct btrfs_path *path, | ||
2783 | u64 logic_start, | ||
2784 | u64 logic_end) | ||
2785 | { | ||
2786 | struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; | ||
2787 | struct btrfs_root *root = fs_info->extent_root; | ||
2788 | struct btrfs_root *csum_root = fs_info->csum_root; | ||
2789 | struct btrfs_extent_item *extent; | ||
2790 | u64 flags; | ||
2791 | int ret; | ||
2792 | int slot; | ||
2793 | struct extent_buffer *l; | ||
2794 | struct btrfs_key key; | ||
2795 | u64 generation; | ||
2796 | u64 extent_logical; | ||
2797 | u64 extent_physical; | ||
2798 | u64 extent_len; | ||
2799 | struct btrfs_device *extent_dev; | ||
2800 | struct scrub_parity *sparity; | ||
2801 | int nsectors; | ||
2802 | int bitmap_len; | ||
2803 | int extent_mirror_num; | ||
2804 | int stop_loop = 0; | ||
2805 | |||
2806 | nsectors = map->stripe_len / root->sectorsize; | ||
2807 | bitmap_len = scrub_calc_parity_bitmap_len(nsectors); | ||
2808 | sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, | ||
2809 | GFP_NOFS); | ||
2810 | if (!sparity) { | ||
2811 | spin_lock(&sctx->stat_lock); | ||
2812 | sctx->stat.malloc_errors++; | ||
2813 | spin_unlock(&sctx->stat_lock); | ||
2814 | return -ENOMEM; | ||
2815 | } | ||
2816 | |||
2817 | sparity->stripe_len = map->stripe_len; | ||
2818 | sparity->nsectors = nsectors; | ||
2819 | sparity->sctx = sctx; | ||
2820 | sparity->scrub_dev = sdev; | ||
2821 | sparity->logic_start = logic_start; | ||
2822 | sparity->logic_end = logic_end; | ||
2823 | atomic_set(&sparity->ref_count, 1); | ||
2824 | INIT_LIST_HEAD(&sparity->spages); | ||
2825 | sparity->dbitmap = sparity->bitmap; | ||
2826 | sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; | ||
2827 | |||
2828 | ret = 0; | ||
2829 | while (logic_start < logic_end) { | ||
2830 | if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) | ||
2831 | key.type = BTRFS_METADATA_ITEM_KEY; | ||
2832 | else | ||
2833 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
2834 | key.objectid = logic_start; | ||
2835 | key.offset = (u64)-1; | ||
2836 | |||
2837 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
2838 | if (ret < 0) | ||
2839 | goto out; | ||
2840 | |||
2841 | if (ret > 0) { | ||
2842 | ret = btrfs_previous_extent_item(root, path, 0); | ||
2843 | if (ret < 0) | ||
2844 | goto out; | ||
2845 | if (ret > 0) { | ||
2846 | btrfs_release_path(path); | ||
2847 | ret = btrfs_search_slot(NULL, root, &key, | ||
2848 | path, 0, 0); | ||
2849 | if (ret < 0) | ||
2850 | goto out; | ||
2851 | } | ||
2852 | } | ||
2853 | |||
2854 | stop_loop = 0; | ||
2855 | while (1) { | ||
2856 | u64 bytes; | ||
2857 | |||
2858 | l = path->nodes[0]; | ||
2859 | slot = path->slots[0]; | ||
2860 | if (slot >= btrfs_header_nritems(l)) { | ||
2861 | ret = btrfs_next_leaf(root, path); | ||
2862 | if (ret == 0) | ||
2863 | continue; | ||
2864 | if (ret < 0) | ||
2865 | goto out; | ||
2866 | |||
2867 | stop_loop = 1; | ||
2868 | break; | ||
2869 | } | ||
2870 | btrfs_item_key_to_cpu(l, &key, slot); | ||
2871 | |||
2872 | if (key.type == BTRFS_METADATA_ITEM_KEY) | ||
2873 | bytes = root->nodesize; | ||
2874 | else | ||
2875 | bytes = key.offset; | ||
2876 | |||
2877 | if (key.objectid + bytes <= logic_start) | ||
2878 | goto next; | ||
2879 | |||
2880 | if (key.type != BTRFS_EXTENT_ITEM_KEY && | ||
2881 | key.type != BTRFS_METADATA_ITEM_KEY) | ||
2882 | goto next; | ||
2883 | |||
2884 | if (key.objectid > logic_end) { | ||
2885 | stop_loop = 1; | ||
2886 | break; | ||
2887 | } | ||
2888 | |||
2889 | while (key.objectid >= logic_start + map->stripe_len) | ||
2890 | logic_start += map->stripe_len; | ||
2891 | |||
2892 | extent = btrfs_item_ptr(l, slot, | ||
2893 | struct btrfs_extent_item); | ||
2894 | flags = btrfs_extent_flags(l, extent); | ||
2895 | generation = btrfs_extent_generation(l, extent); | ||
2896 | |||
2897 | if (key.objectid < logic_start && | ||
2898 | (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) { | ||
2899 | btrfs_err(fs_info, | ||
2900 | "scrub: tree block %llu spanning stripes, ignored. logical=%llu", | ||
2901 | key.objectid, logic_start); | ||
2902 | goto next; | ||
2903 | } | ||
2904 | again: | ||
2905 | extent_logical = key.objectid; | ||
2906 | extent_len = bytes; | ||
2907 | |||
2908 | if (extent_logical < logic_start) { | ||
2909 | extent_len -= logic_start - extent_logical; | ||
2910 | extent_logical = logic_start; | ||
2911 | } | ||
2912 | |||
2913 | if (extent_logical + extent_len > | ||
2914 | logic_start + map->stripe_len) | ||
2915 | extent_len = logic_start + map->stripe_len - | ||
2916 | extent_logical; | ||
2917 | |||
2918 | scrub_parity_mark_sectors_data(sparity, extent_logical, | ||
2919 | extent_len); | ||
2920 | |||
2921 | scrub_remap_extent(fs_info, extent_logical, | ||
2922 | extent_len, &extent_physical, | ||
2923 | &extent_dev, | ||
2924 | &extent_mirror_num); | ||
2925 | |||
2926 | ret = btrfs_lookup_csums_range(csum_root, | ||
2927 | extent_logical, | ||
2928 | extent_logical + extent_len - 1, | ||
2929 | &sctx->csum_list, 1); | ||
2930 | if (ret) | ||
2931 | goto out; | ||
2932 | |||
2933 | ret = scrub_extent_for_parity(sparity, extent_logical, | ||
2934 | extent_len, | ||
2935 | extent_physical, | ||
2936 | extent_dev, flags, | ||
2937 | generation, | ||
2938 | extent_mirror_num); | ||
2939 | if (ret) | ||
2940 | goto out; | ||
2941 | |||
2942 | scrub_free_csums(sctx); | ||
2943 | if (extent_logical + extent_len < | ||
2944 | key.objectid + bytes) { | ||
2945 | logic_start += map->stripe_len; | ||
2946 | |||
2947 | if (logic_start >= logic_end) { | ||
2948 | stop_loop = 1; | ||
2949 | break; | ||
2950 | } | ||
2951 | |||
2952 | if (logic_start < key.objectid + bytes) { | ||
2953 | cond_resched(); | ||
2954 | goto again; | ||
2955 | } | ||
2956 | } | ||
2957 | next: | ||
2958 | path->slots[0]++; | ||
2959 | } | ||
2960 | |||
2961 | btrfs_release_path(path); | ||
2962 | |||
2963 | if (stop_loop) | ||
2964 | break; | ||
2965 | |||
2966 | logic_start += map->stripe_len; | ||
2967 | } | ||
2968 | out: | ||
2969 | if (ret < 0) | ||
2970 | scrub_parity_mark_sectors_error(sparity, logic_start, | ||
2971 | logic_end - logic_start + 1); | ||
2972 | scrub_parity_put(sparity); | ||
2973 | scrub_submit(sctx); | ||
2974 | mutex_lock(&sctx->wr_ctx.wr_lock); | ||
2975 | scrub_wr_submit(sctx); | ||
2976 | mutex_unlock(&sctx->wr_ctx.wr_lock); | ||
2977 | |||
2978 | btrfs_release_path(path); | ||
2979 | return ret < 0 ? ret : 0; | ||
2980 | } | ||
2981 | |||
2272 | static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | 2982 | static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, |
2273 | struct map_lookup *map, | 2983 | struct map_lookup *map, |
2274 | struct btrfs_device *scrub_dev, | 2984 | struct btrfs_device *scrub_dev, |
2275 | int num, u64 base, u64 length, | 2985 | int num, u64 base, u64 length, |
2276 | int is_dev_replace) | 2986 | int is_dev_replace) |
2277 | { | 2987 | { |
2278 | struct btrfs_path *path; | 2988 | struct btrfs_path *path, *ppath; |
2279 | struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; | 2989 | struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; |
2280 | struct btrfs_root *root = fs_info->extent_root; | 2990 | struct btrfs_root *root = fs_info->extent_root; |
2281 | struct btrfs_root *csum_root = fs_info->csum_root; | 2991 | struct btrfs_root *csum_root = fs_info->csum_root; |
@@ -2302,6 +3012,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2302 | u64 extent_logical; | 3012 | u64 extent_logical; |
2303 | u64 extent_physical; | 3013 | u64 extent_physical; |
2304 | u64 extent_len; | 3014 | u64 extent_len; |
3015 | u64 stripe_logical; | ||
3016 | u64 stripe_end; | ||
2305 | struct btrfs_device *extent_dev; | 3017 | struct btrfs_device *extent_dev; |
2306 | int extent_mirror_num; | 3018 | int extent_mirror_num; |
2307 | int stop_loop = 0; | 3019 | int stop_loop = 0; |
@@ -2327,7 +3039,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2327 | mirror_num = num % map->num_stripes + 1; | 3039 | mirror_num = num % map->num_stripes + 1; |
2328 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | 3040 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | |
2329 | BTRFS_BLOCK_GROUP_RAID6)) { | 3041 | BTRFS_BLOCK_GROUP_RAID6)) { |
2330 | get_raid56_logic_offset(physical, num, map, &offset); | 3042 | get_raid56_logic_offset(physical, num, map, &offset, NULL); |
2331 | increment = map->stripe_len * nr_data_stripes(map); | 3043 | increment = map->stripe_len * nr_data_stripes(map); |
2332 | mirror_num = 1; | 3044 | mirror_num = 1; |
2333 | } else { | 3045 | } else { |
@@ -2339,6 +3051,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2339 | if (!path) | 3051 | if (!path) |
2340 | return -ENOMEM; | 3052 | return -ENOMEM; |
2341 | 3053 | ||
3054 | ppath = btrfs_alloc_path(); | ||
3055 | if (!ppath) { | ||
3056 | btrfs_free_path(path); | ||
3057 | return -ENOMEM; | ||
3058 | } | ||
3059 | |||
2342 | /* | 3060 | /* |
2343 | * work on commit root. The related disk blocks are static as | 3061 | * work on commit root. The related disk blocks are static as |
2344 | * long as COW is applied. This means it is safe to rewrite | 3062 | * long as COW is applied. This means it is safe to rewrite |
@@ -2357,7 +3075,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2357 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | 3075 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | |
2358 | BTRFS_BLOCK_GROUP_RAID6)) { | 3076 | BTRFS_BLOCK_GROUP_RAID6)) { |
2359 | get_raid56_logic_offset(physical_end, num, | 3077 | get_raid56_logic_offset(physical_end, num, |
2360 | map, &logic_end); | 3078 | map, &logic_end, NULL); |
2361 | logic_end += base; | 3079 | logic_end += base; |
2362 | } else { | 3080 | } else { |
2363 | logic_end = logical + increment * nstripes; | 3081 | logic_end = logical + increment * nstripes; |
@@ -2404,10 +3122,18 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
2404 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | 3122 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | |
2405 | BTRFS_BLOCK_GROUP_RAID6)) { | 3123 | BTRFS_BLOCK_GROUP_RAID6)) { |
2406 | ret = get_raid56_logic_offset(physical, num, | 3124 | ret = get_raid56_logic_offset(physical, num, |
2407 | map, &logical); | 3125 | map, &logical, &stripe_logical); |
2408 | logical += base; | 3126 | logical += base; |
2409 | if (ret) | 3127 | if (ret) { |
3128 | stripe_logical += base; | ||
3129 | stripe_end = stripe_logical + increment - 1; | ||
3130 | ret = scrub_raid56_parity(sctx, map, scrub_dev, | ||
3131 | ppath, stripe_logical, | ||
3132 | stripe_end); | ||
3133 | if (ret) | ||
3134 | goto out; | ||
2410 | goto skip; | 3135 | goto skip; |
3136 | } | ||
2411 | } | 3137 | } |
2412 | /* | 3138 | /* |
2413 | * canceled? | 3139 | * canceled? |
@@ -2558,13 +3284,25 @@ again: | |||
2558 | * loop until we find next data stripe | 3284 | * loop until we find next data stripe |
2559 | * or we have finished all stripes. | 3285 | * or we have finished all stripes. |
2560 | */ | 3286 | */ |
2561 | do { | 3287 | loop: |
2562 | physical += map->stripe_len; | 3288 | physical += map->stripe_len; |
2563 | ret = get_raid56_logic_offset( | 3289 | ret = get_raid56_logic_offset(physical, |
2564 | physical, num, | 3290 | num, map, &logical, |
2565 | map, &logical); | 3291 | &stripe_logical); |
2566 | logical += base; | 3292 | logical += base; |
2567 | } while (physical < physical_end && ret); | 3293 | |
3294 | if (ret && physical < physical_end) { | ||
3295 | stripe_logical += base; | ||
3296 | stripe_end = stripe_logical + | ||
3297 | increment - 1; | ||
3298 | ret = scrub_raid56_parity(sctx, | ||
3299 | map, scrub_dev, ppath, | ||
3300 | stripe_logical, | ||
3301 | stripe_end); | ||
3302 | if (ret) | ||
3303 | goto out; | ||
3304 | goto loop; | ||
3305 | } | ||
2568 | } else { | 3306 | } else { |
2569 | physical += map->stripe_len; | 3307 | physical += map->stripe_len; |
2570 | logical += increment; | 3308 | logical += increment; |
@@ -2605,6 +3343,7 @@ out: | |||
2605 | 3343 | ||
2606 | blk_finish_plug(&plug); | 3344 | blk_finish_plug(&plug); |
2607 | btrfs_free_path(path); | 3345 | btrfs_free_path(path); |
3346 | btrfs_free_path(ppath); | ||
2608 | return ret < 0 ? ret : 0; | 3347 | return ret < 0 ? ret : 0; |
2609 | } | 3348 | } |
2610 | 3349 | ||
@@ -3310,6 +4049,50 @@ out: | |||
3310 | scrub_pending_trans_workers_dec(sctx); | 4049 | scrub_pending_trans_workers_dec(sctx); |
3311 | } | 4050 | } |
3312 | 4051 | ||
4052 | static int check_extent_to_block(struct inode *inode, u64 start, u64 len, | ||
4053 | u64 logical) | ||
4054 | { | ||
4055 | struct extent_state *cached_state = NULL; | ||
4056 | struct btrfs_ordered_extent *ordered; | ||
4057 | struct extent_io_tree *io_tree; | ||
4058 | struct extent_map *em; | ||
4059 | u64 lockstart = start, lockend = start + len - 1; | ||
4060 | int ret = 0; | ||
4061 | |||
4062 | io_tree = &BTRFS_I(inode)->io_tree; | ||
4063 | |||
4064 | lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); | ||
4065 | ordered = btrfs_lookup_ordered_range(inode, lockstart, len); | ||
4066 | if (ordered) { | ||
4067 | btrfs_put_ordered_extent(ordered); | ||
4068 | ret = 1; | ||
4069 | goto out_unlock; | ||
4070 | } | ||
4071 | |||
4072 | em = btrfs_get_extent(inode, NULL, 0, start, len, 0); | ||
4073 | if (IS_ERR(em)) { | ||
4074 | ret = PTR_ERR(em); | ||
4075 | goto out_unlock; | ||
4076 | } | ||
4077 | |||
4078 | /* | ||
4079 | * This extent does not actually cover the logical extent anymore, | ||
4080 | * move on to the next inode. | ||
4081 | */ | ||
4082 | if (em->block_start > logical || | ||
4083 | em->block_start + em->block_len < logical + len) { | ||
4084 | free_extent_map(em); | ||
4085 | ret = 1; | ||
4086 | goto out_unlock; | ||
4087 | } | ||
4088 | free_extent_map(em); | ||
4089 | |||
4090 | out_unlock: | ||
4091 | unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, | ||
4092 | GFP_NOFS); | ||
4093 | return ret; | ||
4094 | } | ||
4095 | |||
3313 | static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, | 4096 | static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, |
3314 | struct scrub_copy_nocow_ctx *nocow_ctx) | 4097 | struct scrub_copy_nocow_ctx *nocow_ctx) |
3315 | { | 4098 | { |
@@ -3318,13 +4101,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, | |||
3318 | struct inode *inode; | 4101 | struct inode *inode; |
3319 | struct page *page; | 4102 | struct page *page; |
3320 | struct btrfs_root *local_root; | 4103 | struct btrfs_root *local_root; |
3321 | struct btrfs_ordered_extent *ordered; | ||
3322 | struct extent_map *em; | ||
3323 | struct extent_state *cached_state = NULL; | ||
3324 | struct extent_io_tree *io_tree; | 4104 | struct extent_io_tree *io_tree; |
3325 | u64 physical_for_dev_replace; | 4105 | u64 physical_for_dev_replace; |
4106 | u64 nocow_ctx_logical; | ||
3326 | u64 len = nocow_ctx->len; | 4107 | u64 len = nocow_ctx->len; |
3327 | u64 lockstart = offset, lockend = offset + len - 1; | ||
3328 | unsigned long index; | 4108 | unsigned long index; |
3329 | int srcu_index; | 4109 | int srcu_index; |
3330 | int ret = 0; | 4110 | int ret = 0; |
@@ -3356,30 +4136,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, | |||
3356 | 4136 | ||
3357 | physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; | 4137 | physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; |
3358 | io_tree = &BTRFS_I(inode)->io_tree; | 4138 | io_tree = &BTRFS_I(inode)->io_tree; |
4139 | nocow_ctx_logical = nocow_ctx->logical; | ||
3359 | 4140 | ||
3360 | lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); | 4141 | ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical); |
3361 | ordered = btrfs_lookup_ordered_range(inode, lockstart, len); | 4142 | if (ret) { |
3362 | if (ordered) { | 4143 | ret = ret > 0 ? 0 : ret; |
3363 | btrfs_put_ordered_extent(ordered); | 4144 | goto out; |
3364 | goto out_unlock; | ||
3365 | } | ||
3366 | |||
3367 | em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0); | ||
3368 | if (IS_ERR(em)) { | ||
3369 | ret = PTR_ERR(em); | ||
3370 | goto out_unlock; | ||
3371 | } | ||
3372 | |||
3373 | /* | ||
3374 | * This extent does not actually cover the logical extent anymore, | ||
3375 | * move on to the next inode. | ||
3376 | */ | ||
3377 | if (em->block_start > nocow_ctx->logical || | ||
3378 | em->block_start + em->block_len < nocow_ctx->logical + len) { | ||
3379 | free_extent_map(em); | ||
3380 | goto out_unlock; | ||
3381 | } | 4145 | } |
3382 | free_extent_map(em); | ||
3383 | 4146 | ||
3384 | while (len >= PAGE_CACHE_SIZE) { | 4147 | while (len >= PAGE_CACHE_SIZE) { |
3385 | index = offset >> PAGE_CACHE_SHIFT; | 4148 | index = offset >> PAGE_CACHE_SHIFT; |
@@ -3396,7 +4159,7 @@ again: | |||
3396 | goto next_page; | 4159 | goto next_page; |
3397 | } else { | 4160 | } else { |
3398 | ClearPageError(page); | 4161 | ClearPageError(page); |
3399 | err = extent_read_full_page_nolock(io_tree, page, | 4162 | err = extent_read_full_page(io_tree, page, |
3400 | btrfs_get_extent, | 4163 | btrfs_get_extent, |
3401 | nocow_ctx->mirror_num); | 4164 | nocow_ctx->mirror_num); |
3402 | if (err) { | 4165 | if (err) { |
@@ -3421,6 +4184,14 @@ again: | |||
3421 | goto next_page; | 4184 | goto next_page; |
3422 | } | 4185 | } |
3423 | } | 4186 | } |
4187 | |||
4188 | ret = check_extent_to_block(inode, offset, len, | ||
4189 | nocow_ctx_logical); | ||
4190 | if (ret) { | ||
4191 | ret = ret > 0 ? 0 : ret; | ||
4192 | goto next_page; | ||
4193 | } | ||
4194 | |||
3424 | err = write_page_nocow(nocow_ctx->sctx, | 4195 | err = write_page_nocow(nocow_ctx->sctx, |
3425 | physical_for_dev_replace, page); | 4196 | physical_for_dev_replace, page); |
3426 | if (err) | 4197 | if (err) |
@@ -3434,12 +4205,10 @@ next_page: | |||
3434 | 4205 | ||
3435 | offset += PAGE_CACHE_SIZE; | 4206 | offset += PAGE_CACHE_SIZE; |
3436 | physical_for_dev_replace += PAGE_CACHE_SIZE; | 4207 | physical_for_dev_replace += PAGE_CACHE_SIZE; |
4208 | nocow_ctx_logical += PAGE_CACHE_SIZE; | ||
3437 | len -= PAGE_CACHE_SIZE; | 4209 | len -= PAGE_CACHE_SIZE; |
3438 | } | 4210 | } |
3439 | ret = COPY_COMPLETE; | 4211 | ret = COPY_COMPLETE; |
3440 | out_unlock: | ||
3441 | unlock_extent_cached(io_tree, lockstart, lockend, &cached_state, | ||
3442 | GFP_NOFS); | ||
3443 | out: | 4212 | out: |
3444 | mutex_unlock(&inode->i_mutex); | 4213 | mutex_unlock(&inode->i_mutex); |
3445 | iput(inode); | 4214 | iput(inode); |
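The hunks above replace the open-coded extent lock / ordered-extent / extent-map checks in copy_nocow_pages_for_inode() with calls to a new check_extent_to_block() helper (only its tail is visible at the top of this excerpt) and repeat the check after every page read, since the range is no longer kept locked for the whole copy. A minimal sketch of that helper, reconstructed from the removed lines and the visible tail; the exact signature and the "return 1 means the extent moved, skip this inode" convention are assumptions inferred from the caller:

	static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
					 u64 logical)
	{
		struct extent_state *cached_state = NULL;
		struct btrfs_ordered_extent *ordered;
		struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
		struct extent_map *em;
		u64 lockstart = start, lockend = start + len - 1;
		int ret = 0;

		lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);

		/* Someone is writing this range right now, let them finish. */
		ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
		if (ordered) {
			btrfs_put_ordered_extent(ordered);
			ret = 1;
			goto out_unlock;
		}

		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_unlock;
		}

		/* The extent no longer covers the logical range, skip this inode. */
		if (em->block_start > logical ||
		    em->block_start + em->block_len < logical + len)
			ret = 1;
		free_extent_map(em);

	out_unlock:
		unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
				     GFP_NOFS);
		return ret;
	}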
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 874828dd0a86..804432dbc351 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c | |||
@@ -5507,6 +5507,51 @@ out: | |||
5507 | return ret; | 5507 | return ret; |
5508 | } | 5508 | } |
5509 | 5509 | ||
5510 | /* | ||
5511 | * If orphan cleanup did remove any orphans from a root, it means the tree | ||
5512 | * was modified and therefore the commit root is not the same as the current | ||
5513 | * root anymore. This is a problem, because send uses the commit root and | ||
5514 | * therefore can see inode items that don't exist in the current root anymore, | ||
5515 | * and for example make calls to btrfs_iget, which will do tree lookups based | ||
5516 | * on the current root and not on the commit root. Those lookups will fail, | ||
5517 | * returning a -ESTALE error, and making send fail with that error. So make | ||
5518 | * sure a send does not see any orphans we have just removed, and that it will | ||
5519 | * see the same inodes regardless of whether a transaction commit happened | ||
5520 | * before it started (meaning that the commit root will be the same as the | ||
5521 | * current root) or not. | ||
5522 | */ | ||
5523 | static int ensure_commit_roots_uptodate(struct send_ctx *sctx) | ||
5524 | { | ||
5525 | int i; | ||
5526 | struct btrfs_trans_handle *trans = NULL; | ||
5527 | |||
5528 | again: | ||
5529 | if (sctx->parent_root && | ||
5530 | sctx->parent_root->node != sctx->parent_root->commit_root) | ||
5531 | goto commit_trans; | ||
5532 | |||
5533 | for (i = 0; i < sctx->clone_roots_cnt; i++) | ||
5534 | if (sctx->clone_roots[i].root->node != | ||
5535 | sctx->clone_roots[i].root->commit_root) | ||
5536 | goto commit_trans; | ||
5537 | |||
5538 | if (trans) | ||
5539 | return btrfs_end_transaction(trans, sctx->send_root); | ||
5540 | |||
5541 | return 0; | ||
5542 | |||
5543 | commit_trans: | ||
5544 | /* Use any root, all fs roots will get their commit roots updated. */ | ||
5545 | if (!trans) { | ||
5546 | trans = btrfs_join_transaction(sctx->send_root); | ||
5547 | if (IS_ERR(trans)) | ||
5548 | return PTR_ERR(trans); | ||
5549 | goto again; | ||
5550 | } | ||
5551 | |||
5552 | return btrfs_commit_transaction(trans, sctx->send_root); | ||
5553 | } | ||
5554 | |||
5510 | static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) | 5555 | static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) |
5511 | { | 5556 | { |
5512 | spin_lock(&root->root_item_lock); | 5557 | spin_lock(&root->root_item_lock); |
@@ -5728,6 +5773,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
5728 | NULL); | 5773 | NULL); |
5729 | sort_clone_roots = 1; | 5774 | sort_clone_roots = 1; |
5730 | 5775 | ||
5776 | ret = ensure_commit_roots_uptodate(sctx); | ||
5777 | if (ret) | ||
5778 | goto out; | ||
5779 | |||
5731 | current->journal_info = BTRFS_SEND_TRANS_STUB; | 5780 | current->journal_info = BTRFS_SEND_TRANS_STUB; |
5732 | ret = send_subvol(sctx); | 5781 | ret = send_subvol(sctx); |
5733 | current->journal_info = NULL; | 5782 | current->journal_info = NULL; |
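ensure_commit_roots_uptodate() above treats a root as having uncommitted changes whenever its current node differs from the node captured at the last commit; if the parent root or any clone root is in that state, it joins and commits a transaction so the commit roots that send reads from match what btrfs_iget() will find. The test, written as a standalone predicate purely for illustration (the helper name is hypothetical, not part of the patch):

	static inline bool btrfs_root_dirty_since_commit(struct btrfs_root *root)
	{
		return root->node != root->commit_root;
	}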
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 54bd91ece35b..60f7cbe815e9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -262,7 +262,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, | |||
262 | trans->aborted = errno; | 262 | trans->aborted = errno; |
263 | /* Nothing used. The other threads that have joined this | 263 | /* Nothing used. The other threads that have joined this |
264 | * transaction may be able to continue. */ | 264 | * transaction may be able to continue. */ |
265 | if (!trans->blocks_used) { | 265 | if (!trans->blocks_used && list_empty(&trans->new_bgs)) { |
266 | const char *errstr; | 266 | const char *errstr; |
267 | 267 | ||
268 | errstr = btrfs_decode_error(errno); | 268 | errstr = btrfs_decode_error(errno); |
@@ -642,11 +642,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
642 | "disabling disk space caching"); | 642 | "disabling disk space caching"); |
643 | break; | 643 | break; |
644 | case Opt_inode_cache: | 644 | case Opt_inode_cache: |
645 | btrfs_set_and_info(root, CHANGE_INODE_CACHE, | 645 | btrfs_set_pending_and_info(info, INODE_MAP_CACHE, |
646 | "enabling inode map caching"); | 646 | "enabling inode map caching"); |
647 | break; | 647 | break; |
648 | case Opt_noinode_cache: | 648 | case Opt_noinode_cache: |
649 | btrfs_clear_and_info(root, CHANGE_INODE_CACHE, | 649 | btrfs_clear_pending_and_info(info, INODE_MAP_CACHE, |
650 | "disabling inode map caching"); | 650 | "disabling inode map caching"); |
651 | break; | 651 | break; |
652 | case Opt_clear_cache: | 652 | case Opt_clear_cache: |
@@ -993,9 +993,17 @@ int btrfs_sync_fs(struct super_block *sb, int wait) | |||
993 | trans = btrfs_attach_transaction_barrier(root); | 993 | trans = btrfs_attach_transaction_barrier(root); |
994 | if (IS_ERR(trans)) { | 994 | if (IS_ERR(trans)) { |
995 | /* no transaction, don't bother */ | 995 | /* no transaction, don't bother */ |
996 | if (PTR_ERR(trans) == -ENOENT) | 996 | if (PTR_ERR(trans) == -ENOENT) { |
997 | return 0; | 997 | /* |
998 | return PTR_ERR(trans); | 998 | * Exit unless we have some pending changes |
999 | * that need to go through commit | ||
1000 | */ | ||
1001 | if (fs_info->pending_changes == 0) | ||
1002 | return 0; | ||
1003 | trans = btrfs_start_transaction(root, 0); | ||
1004 | } else { | ||
1005 | return PTR_ERR(trans); | ||
1006 | } | ||
999 | } | 1007 | } |
1000 | return btrfs_commit_transaction(trans, root); | 1008 | return btrfs_commit_transaction(trans, root); |
1001 | } | 1009 | } |
@@ -1644,8 +1652,20 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1644 | int i = 0, nr_devices; | 1652 | int i = 0, nr_devices; |
1645 | int ret; | 1653 | int ret; |
1646 | 1654 | ||
1655 | /* | ||
1656 | * We aren't under the device list lock, so this is racey-ish, but good | ||
1657 | * enough for our purposes. | ||
1658 | */ | ||
1647 | nr_devices = fs_info->fs_devices->open_devices; | 1659 | nr_devices = fs_info->fs_devices->open_devices; |
1648 | BUG_ON(!nr_devices); | 1660 | if (!nr_devices) { |
1661 | smp_mb(); | ||
1662 | nr_devices = fs_info->fs_devices->open_devices; | ||
1663 | ASSERT(nr_devices); | ||
1664 | if (!nr_devices) { | ||
1665 | *free_bytes = 0; | ||
1666 | return 0; | ||
1667 | } | ||
1668 | } | ||
1649 | 1669 | ||
1650 | devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), | 1670 | devices_info = kmalloc_array(nr_devices, sizeof(*devices_info), |
1651 | GFP_NOFS); | 1671 | GFP_NOFS); |
@@ -1670,11 +1690,17 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1670 | else | 1690 | else |
1671 | min_stripe_size = BTRFS_STRIPE_LEN; | 1691 | min_stripe_size = BTRFS_STRIPE_LEN; |
1672 | 1692 | ||
1673 | list_for_each_entry(device, &fs_devices->devices, dev_list) { | 1693 | if (fs_info->alloc_start) |
1694 | mutex_lock(&fs_devices->device_list_mutex); | ||
1695 | rcu_read_lock(); | ||
1696 | list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { | ||
1674 | if (!device->in_fs_metadata || !device->bdev || | 1697 | if (!device->in_fs_metadata || !device->bdev || |
1675 | device->is_tgtdev_for_dev_replace) | 1698 | device->is_tgtdev_for_dev_replace) |
1676 | continue; | 1699 | continue; |
1677 | 1700 | ||
1701 | if (i >= nr_devices) | ||
1702 | break; | ||
1703 | |||
1678 | avail_space = device->total_bytes - device->bytes_used; | 1704 | avail_space = device->total_bytes - device->bytes_used; |
1679 | 1705 | ||
1680 | /* align with stripe_len */ | 1706 | /* align with stripe_len */ |
@@ -1689,24 +1715,32 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1689 | skip_space = 1024 * 1024; | 1715 | skip_space = 1024 * 1024; |
1690 | 1716 | ||
1691 | /* user can set the offset in fs_info->alloc_start. */ | 1717 | /* user can set the offset in fs_info->alloc_start. */ |
1692 | if (fs_info->alloc_start + BTRFS_STRIPE_LEN <= | 1718 | if (fs_info->alloc_start && |
1693 | device->total_bytes) | 1719 | fs_info->alloc_start + BTRFS_STRIPE_LEN <= |
1720 | device->total_bytes) { | ||
1721 | rcu_read_unlock(); | ||
1694 | skip_space = max(fs_info->alloc_start, skip_space); | 1722 | skip_space = max(fs_info->alloc_start, skip_space); |
1695 | 1723 | ||
1696 | /* | 1724 | /* |
1697 | * btrfs can not use the free space in [0, skip_space - 1], | 1725 | * btrfs can not use the free space in |
1698 | * we must subtract it from the total. In order to implement | 1726 | * [0, skip_space - 1], we must subtract it from the |
1699 | * it, we account the used space in this range first. | 1727 | * total. In order to implement it, we account the used |
1700 | */ | 1728 | * space in this range first. |
1701 | ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1, | 1729 | */ |
1702 | &used_space); | 1730 | ret = btrfs_account_dev_extents_size(device, 0, |
1703 | if (ret) { | 1731 | skip_space - 1, |
1704 | kfree(devices_info); | 1732 | &used_space); |
1705 | return ret; | 1733 | if (ret) { |
1706 | } | 1734 | kfree(devices_info); |
1735 | mutex_unlock(&fs_devices->device_list_mutex); | ||
1736 | return ret; | ||
1737 | } | ||
1707 | 1738 | ||
1708 | /* calc the free space in [0, skip_space - 1] */ | 1739 | rcu_read_lock(); |
1709 | skip_space -= used_space; | 1740 | |
1741 | /* calc the free space in [0, skip_space - 1] */ | ||
1742 | skip_space -= used_space; | ||
1743 | } | ||
1710 | 1744 | ||
1711 | /* | 1745 | /* |
1712 | * we can use the free space in [0, skip_space - 1], subtract | 1746 | * we can use the free space in [0, skip_space - 1], subtract |
@@ -1725,6 +1759,9 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) | |||
1725 | 1759 | ||
1726 | i++; | 1760 | i++; |
1727 | } | 1761 | } |
1762 | rcu_read_unlock(); | ||
1763 | if (fs_info->alloc_start) | ||
1764 | mutex_unlock(&fs_devices->device_list_mutex); | ||
1728 | 1765 | ||
1729 | nr_devices = i; | 1766 | nr_devices = i; |
1730 | 1767 | ||
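The loop above now walks fs_devices->devices under rcu_read_lock() instead of a plain list walk, and it takes device_list_mutex only when fs_info->alloc_start is set. That is the case in which the loop drops the RCU read lock around btrfs_account_dev_extents_size() (which can block) and re-acquires it afterwards, and resuming an RCU list walk across such a gap is only safe because the mutex keeps the list stable. A condensed sketch of the pattern with the btrfs specifics stripped out; need_exclusion and sleep_op() are stand-ins, not real identifiers:

	if (need_exclusion)			/* i.e. fs_info->alloc_start != 0 */
		mutex_lock(&fs_devices->device_list_mutex);
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		if (need_exclusion) {
			rcu_read_unlock();	/* about to call something that sleeps */
			sleep_op(device);	/* btrfs_account_dev_extents_size() above */
			rcu_read_lock();	/* safe to resume: the mutex pins the list */
		}
		/* ... per-device free space accounting ... */
	}
	rcu_read_unlock();
	if (need_exclusion)
		mutex_unlock(&fs_devices->device_list_mutex);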
@@ -1787,8 +1824,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1787 | * holding chunk_muext to avoid allocating new chunks, holding | 1824 | * holding chunk_muext to avoid allocating new chunks, holding |
1787 | * holding chunk_mutex to avoid allocating new chunks, holding | 1824 | * holding chunk_mutex to avoid allocating new chunks, holding |
1788 | * device_list_mutex to avoid the device being removed | 1825 | * device_list_mutex to avoid the device being removed |
1789 | */ | 1826 | */ |
1790 | mutex_lock(&fs_info->fs_devices->device_list_mutex); | ||
1791 | mutex_lock(&fs_info->chunk_mutex); | ||
1792 | rcu_read_lock(); | 1827 | rcu_read_lock(); |
1793 | list_for_each_entry_rcu(found, head, list) { | 1828 | list_for_each_entry_rcu(found, head, list) { |
1794 | if (found->flags & BTRFS_BLOCK_GROUP_DATA) { | 1829 | if (found->flags & BTRFS_BLOCK_GROUP_DATA) { |
@@ -1824,17 +1859,12 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1824 | buf->f_bfree -= block_rsv->size >> bits; | 1859 | buf->f_bfree -= block_rsv->size >> bits; |
1825 | spin_unlock(&block_rsv->lock); | 1860 | spin_unlock(&block_rsv->lock); |
1826 | 1861 | ||
1827 | buf->f_bavail = total_free_data; | 1862 | buf->f_bavail = div_u64(total_free_data, factor); |
1828 | ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); | 1863 | ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); |
1829 | if (ret) { | 1864 | if (ret) |
1830 | mutex_unlock(&fs_info->chunk_mutex); | ||
1831 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); | ||
1832 | return ret; | 1865 | return ret; |
1833 | } | ||
1834 | buf->f_bavail += div_u64(total_free_data, factor); | 1866 | buf->f_bavail += div_u64(total_free_data, factor); |
1835 | buf->f_bavail = buf->f_bavail >> bits; | 1867 | buf->f_bavail = buf->f_bavail >> bits; |
1836 | mutex_unlock(&fs_info->chunk_mutex); | ||
1837 | mutex_unlock(&fs_info->fs_devices->device_list_mutex); | ||
1838 | 1868 | ||
1839 | buf->f_type = BTRFS_SUPER_MAGIC; | 1869 | buf->f_type = BTRFS_SUPER_MAGIC; |
1840 | buf->f_bsize = dentry->d_sb->s_blocksize; | 1870 | buf->f_bsize = dentry->d_sb->s_blocksize; |
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b2e7bb4393f6..92db3f648df4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c | |||
@@ -111,7 +111,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, | |||
111 | { | 111 | { |
112 | struct btrfs_fs_info *fs_info; | 112 | struct btrfs_fs_info *fs_info; |
113 | struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); | 113 | struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a); |
114 | struct btrfs_trans_handle *trans; | ||
115 | u64 features, set, clear; | 114 | u64 features, set, clear; |
116 | unsigned long val; | 115 | unsigned long val; |
117 | int ret; | 116 | int ret; |
@@ -153,10 +152,6 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, | |||
153 | btrfs_info(fs_info, "%s %s feature flag", | 152 | btrfs_info(fs_info, "%s %s feature flag", |
154 | val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); | 153 | val ? "Setting" : "Clearing", fa->kobj_attr.attr.name); |
155 | 154 | ||
156 | trans = btrfs_start_transaction(fs_info->fs_root, 0); | ||
157 | if (IS_ERR(trans)) | ||
158 | return PTR_ERR(trans); | ||
159 | |||
160 | spin_lock(&fs_info->super_lock); | 155 | spin_lock(&fs_info->super_lock); |
161 | features = get_features(fs_info, fa->feature_set); | 156 | features = get_features(fs_info, fa->feature_set); |
162 | if (val) | 157 | if (val) |
@@ -166,9 +161,11 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, | |||
166 | set_features(fs_info, fa->feature_set, features); | 161 | set_features(fs_info, fa->feature_set, features); |
167 | spin_unlock(&fs_info->super_lock); | 162 | spin_unlock(&fs_info->super_lock); |
168 | 163 | ||
169 | ret = btrfs_commit_transaction(trans, fs_info->fs_root); | 164 | /* |
170 | if (ret) | 165 | * We don't want to do full transaction commit from inside sysfs |
171 | return ret; | 166 | */ |
167 | btrfs_set_pending(fs_info, COMMIT); | ||
168 | wake_up_process(fs_info->transaction_kthread); | ||
172 | 169 | ||
173 | return count; | 170 | return count; |
174 | } | 171 | } |
@@ -372,9 +369,6 @@ static ssize_t btrfs_label_store(struct kobject *kobj, | |||
372 | const char *buf, size_t len) | 369 | const char *buf, size_t len) |
373 | { | 370 | { |
374 | struct btrfs_fs_info *fs_info = to_fs_info(kobj); | 371 | struct btrfs_fs_info *fs_info = to_fs_info(kobj); |
375 | struct btrfs_trans_handle *trans; | ||
376 | struct btrfs_root *root = fs_info->fs_root; | ||
377 | int ret; | ||
378 | size_t p_len; | 372 | size_t p_len; |
379 | 373 | ||
380 | if (fs_info->sb->s_flags & MS_RDONLY) | 374 | if (fs_info->sb->s_flags & MS_RDONLY) |
@@ -389,20 +383,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj, | |||
389 | if (p_len >= BTRFS_LABEL_SIZE) | 383 | if (p_len >= BTRFS_LABEL_SIZE) |
390 | return -EINVAL; | 384 | return -EINVAL; |
391 | 385 | ||
392 | trans = btrfs_start_transaction(root, 0); | 386 | spin_lock(&fs_info->super_lock); |
393 | if (IS_ERR(trans)) | ||
394 | return PTR_ERR(trans); | ||
395 | |||
396 | spin_lock(&root->fs_info->super_lock); | ||
397 | memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); | 387 | memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); |
398 | memcpy(fs_info->super_copy->label, buf, p_len); | 388 | memcpy(fs_info->super_copy->label, buf, p_len); |
399 | spin_unlock(&root->fs_info->super_lock); | 389 | spin_unlock(&fs_info->super_lock); |
400 | ret = btrfs_commit_transaction(trans, root); | ||
401 | 390 | ||
402 | if (!ret) | 391 | /* |
403 | return len; | 392 | * We don't want to do full transaction commit from inside sysfs |
393 | */ | ||
394 | btrfs_set_pending(fs_info, COMMIT); | ||
395 | wake_up_process(fs_info->transaction_kthread); | ||
404 | 396 | ||
405 | return ret; | 397 | return len; |
406 | } | 398 | } |
407 | BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); | 399 | BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); |
408 | 400 | ||
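Both sysfs store handlers above stop committing a transaction from sysfs context: they update fs_info->super_copy under super_lock, record a pending COMMIT request and wake the transaction kthread, which performs the commit and lets btrfs_apply_pending_changes() (in the transaction.c hunks below) consume the bits. The btrfs_set_pending() helper and the bit names live in ctree.h, which is not part of this excerpt; a sketch of what they presumably look like, with exact values and spelling treated as assumptions:

	/* Pending changes applied at the next transaction commit. */
	#define BTRFS_PENDING_SET_INODE_MAP_CACHE	(0)
	#define BTRFS_PENDING_CLEAR_INODE_MAP_CACHE	(1)
	#define BTRFS_PENDING_COMMIT			(2)

	#define btrfs_set_pending(info, change)	\
		set_bit(BTRFS_PENDING_##change, &(info)->pending_changes)
	#define btrfs_clear_pending(info, change)	\
		clear_bit(BTRFS_PENDING_##change, &(info)->pending_changes)

The design choice is simply to defer: a sysfs write must not block on (or deadlock against) a full commit, so it only records intent and relies on the next commit, whether triggered by the kthread, sync_fs() or any other writer.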
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae3616728..a605d4e2f2bc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -76,6 +76,32 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) | |||
76 | } | 76 | } |
77 | } | 77 | } |
78 | 78 | ||
79 | static void clear_btree_io_tree(struct extent_io_tree *tree) | ||
80 | { | ||
81 | spin_lock(&tree->lock); | ||
82 | while (!RB_EMPTY_ROOT(&tree->state)) { | ||
83 | struct rb_node *node; | ||
84 | struct extent_state *state; | ||
85 | |||
86 | node = rb_first(&tree->state); | ||
87 | state = rb_entry(node, struct extent_state, rb_node); | ||
88 | rb_erase(&state->rb_node, &tree->state); | ||
89 | RB_CLEAR_NODE(&state->rb_node); | ||
90 | /* | ||
91 | * btree io trees aren't supposed to have tasks waiting for | ||
92 | * changes in the flags of extent states ever. | ||
93 | */ | ||
94 | ASSERT(!waitqueue_active(&state->wq)); | ||
95 | free_extent_state(state); | ||
96 | if (need_resched()) { | ||
97 | spin_unlock(&tree->lock); | ||
98 | cond_resched(); | ||
99 | spin_lock(&tree->lock); | ||
100 | } | ||
101 | } | ||
102 | spin_unlock(&tree->lock); | ||
103 | } | ||
104 | |||
79 | static noinline void switch_commit_roots(struct btrfs_transaction *trans, | 105 | static noinline void switch_commit_roots(struct btrfs_transaction *trans, |
80 | struct btrfs_fs_info *fs_info) | 106 | struct btrfs_fs_info *fs_info) |
81 | { | 107 | { |
@@ -89,6 +115,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans, | |||
89 | root->commit_root = btrfs_root_node(root); | 115 | root->commit_root = btrfs_root_node(root); |
90 | if (is_fstree(root->objectid)) | 116 | if (is_fstree(root->objectid)) |
91 | btrfs_unpin_free_ino(root); | 117 | btrfs_unpin_free_ino(root); |
118 | clear_btree_io_tree(&root->dirty_log_pages); | ||
92 | } | 119 | } |
93 | up_write(&fs_info->commit_root_sem); | 120 | up_write(&fs_info->commit_root_sem); |
94 | } | 121 | } |
@@ -220,6 +247,7 @@ loop: | |||
220 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); | 247 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); |
221 | INIT_LIST_HEAD(&cur_trans->pending_chunks); | 248 | INIT_LIST_HEAD(&cur_trans->pending_chunks); |
222 | INIT_LIST_HEAD(&cur_trans->switch_commits); | 249 | INIT_LIST_HEAD(&cur_trans->switch_commits); |
250 | INIT_LIST_HEAD(&cur_trans->pending_ordered); | ||
223 | list_add_tail(&cur_trans->list, &fs_info->trans_list); | 251 | list_add_tail(&cur_trans->list, &fs_info->trans_list); |
224 | extent_io_tree_init(&cur_trans->dirty_pages, | 252 | extent_io_tree_init(&cur_trans->dirty_pages, |
225 | fs_info->btree_inode->i_mapping); | 253 | fs_info->btree_inode->i_mapping); |
@@ -488,6 +516,7 @@ again: | |||
488 | h->sync = false; | 516 | h->sync = false; |
489 | INIT_LIST_HEAD(&h->qgroup_ref_list); | 517 | INIT_LIST_HEAD(&h->qgroup_ref_list); |
490 | INIT_LIST_HEAD(&h->new_bgs); | 518 | INIT_LIST_HEAD(&h->new_bgs); |
519 | INIT_LIST_HEAD(&h->ordered); | ||
491 | 520 | ||
492 | smp_mb(); | 521 | smp_mb(); |
493 | if (cur_trans->state >= TRANS_STATE_BLOCKED && | 522 | if (cur_trans->state >= TRANS_STATE_BLOCKED && |
@@ -719,6 +748,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
719 | if (!list_empty(&trans->new_bgs)) | 748 | if (!list_empty(&trans->new_bgs)) |
720 | btrfs_create_pending_block_groups(trans, root); | 749 | btrfs_create_pending_block_groups(trans, root); |
721 | 750 | ||
751 | if (!list_empty(&trans->ordered)) { | ||
752 | spin_lock(&info->trans_lock); | ||
753 | list_splice(&trans->ordered, &cur_trans->pending_ordered); | ||
754 | spin_unlock(&info->trans_lock); | ||
755 | } | ||
756 | |||
722 | trans->delayed_ref_updates = 0; | 757 | trans->delayed_ref_updates = 0; |
723 | if (!trans->sync) { | 758 | if (!trans->sync) { |
724 | must_run_delayed_refs = | 759 | must_run_delayed_refs = |
@@ -828,17 +863,39 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
828 | 863 | ||
829 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 864 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
830 | mark, &cached_state)) { | 865 | mark, &cached_state)) { |
831 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 866 | bool wait_writeback = false; |
832 | mark, &cached_state, GFP_NOFS); | 867 | |
833 | cached_state = NULL; | 868 | err = convert_extent_bit(dirty_pages, start, end, |
834 | err = filemap_fdatawrite_range(mapping, start, end); | 869 | EXTENT_NEED_WAIT, |
870 | mark, &cached_state, GFP_NOFS); | ||
871 | /* | ||
872 | * convert_extent_bit can return -ENOMEM, which is most of the | ||
873 | * time a temporary error. So when it happens, ignore the error | ||
874 | * and wait for writeback of this range to finish - because we | ||
875 | * failed to set the bit EXTENT_NEED_WAIT for the range, a call | ||
876 | * to btrfs_wait_marked_extents() would not know that writeback | ||
877 | * for this range started and therefore wouldn't wait for it to | ||
878 | * finish - we don't want to commit a superblock that points to | ||
879 | * btree nodes/leafs for which writeback hasn't finished yet | ||
880 | * (and without errors). | ||
881 | * We cleanup any entries left in the io tree when committing | ||
882 | * the transaction (through clear_btree_io_tree()). | ||
883 | */ | ||
884 | if (err == -ENOMEM) { | ||
885 | err = 0; | ||
886 | wait_writeback = true; | ||
887 | } | ||
888 | if (!err) | ||
889 | err = filemap_fdatawrite_range(mapping, start, end); | ||
835 | if (err) | 890 | if (err) |
836 | werr = err; | 891 | werr = err; |
892 | else if (wait_writeback) | ||
893 | werr = filemap_fdatawait_range(mapping, start, end); | ||
894 | free_extent_state(cached_state); | ||
895 | cached_state = NULL; | ||
837 | cond_resched(); | 896 | cond_resched(); |
838 | start = end + 1; | 897 | start = end + 1; |
839 | } | 898 | } |
840 | if (err) | ||
841 | werr = err; | ||
842 | return werr; | 899 | return werr; |
843 | } | 900 | } |
844 | 901 | ||
@@ -862,11 +919,25 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, | |||
862 | 919 | ||
863 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 920 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
864 | EXTENT_NEED_WAIT, &cached_state)) { | 921 | EXTENT_NEED_WAIT, &cached_state)) { |
865 | clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 922 | /* |
866 | 0, 0, &cached_state, GFP_NOFS); | 923 | * Ignore -ENOMEM errors returned by clear_extent_bit(). |
867 | err = filemap_fdatawait_range(mapping, start, end); | 924 | * When committing the transaction, we'll remove any entries |
925 | * left in the io tree. For a log commit, we don't remove them | ||
926 | * after committing the log because the tree can be accessed | ||
927 | * concurrently - we do it only at transaction commit time when | ||
928 | * it's safe to do it (through clear_btree_io_tree()). | ||
929 | */ | ||
930 | err = clear_extent_bit(dirty_pages, start, end, | ||
931 | EXTENT_NEED_WAIT, | ||
932 | 0, 0, &cached_state, GFP_NOFS); | ||
933 | if (err == -ENOMEM) | ||
934 | err = 0; | ||
935 | if (!err) | ||
936 | err = filemap_fdatawait_range(mapping, start, end); | ||
868 | if (err) | 937 | if (err) |
869 | werr = err; | 938 | werr = err; |
939 | free_extent_state(cached_state); | ||
940 | cached_state = NULL; | ||
870 | cond_resched(); | 941 | cond_resched(); |
871 | start = end + 1; | 942 | start = end + 1; |
872 | } | 943 | } |
@@ -919,17 +990,17 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | |||
919 | return 0; | 990 | return 0; |
920 | } | 991 | } |
921 | 992 | ||
922 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | 993 | static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, |
923 | struct btrfs_root *root) | 994 | struct btrfs_root *root) |
924 | { | 995 | { |
925 | if (!trans || !trans->transaction) { | 996 | int ret; |
926 | struct inode *btree_inode; | 997 | |
927 | btree_inode = root->fs_info->btree_inode; | 998 | ret = btrfs_write_and_wait_marked_extents(root, |
928 | return filemap_write_and_wait(btree_inode->i_mapping); | ||
929 | } | ||
930 | return btrfs_write_and_wait_marked_extents(root, | ||
931 | &trans->transaction->dirty_pages, | 999 | &trans->transaction->dirty_pages, |
932 | EXTENT_DIRTY); | 1000 | EXTENT_DIRTY); |
1001 | clear_btree_io_tree(&trans->transaction->dirty_pages); | ||
1002 | |||
1003 | return ret; | ||
933 | } | 1004 | } |
934 | 1005 | ||
935 | /* | 1006 | /* |
@@ -1652,6 +1723,28 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) | |||
1652 | btrfs_wait_ordered_roots(fs_info, -1); | 1723 | btrfs_wait_ordered_roots(fs_info, -1); |
1653 | } | 1724 | } |
1654 | 1725 | ||
1726 | static inline void | ||
1727 | btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, | ||
1728 | struct btrfs_fs_info *fs_info) | ||
1729 | { | ||
1730 | struct btrfs_ordered_extent *ordered; | ||
1731 | |||
1732 | spin_lock(&fs_info->trans_lock); | ||
1733 | while (!list_empty(&cur_trans->pending_ordered)) { | ||
1734 | ordered = list_first_entry(&cur_trans->pending_ordered, | ||
1735 | struct btrfs_ordered_extent, | ||
1736 | trans_list); | ||
1737 | list_del_init(&ordered->trans_list); | ||
1738 | spin_unlock(&fs_info->trans_lock); | ||
1739 | |||
1740 | wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE, | ||
1741 | &ordered->flags)); | ||
1742 | btrfs_put_ordered_extent(ordered); | ||
1743 | spin_lock(&fs_info->trans_lock); | ||
1744 | } | ||
1745 | spin_unlock(&fs_info->trans_lock); | ||
1746 | } | ||
1747 | |||
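btrfs_wait_pending_ordered() above drains cur_trans->pending_ordered with the usual pattern: pop an entry under trans_lock, drop the lock to sleep in wait_event(), then re-take it. The producer side is elsewhere in this series; code that must see an ordered extent complete before the commit takes a reference and queues it on the handle's trans->ordered list, which __btrfs_end_transaction() (and btrfs_commit_transaction() itself) splices onto the transaction. A hedged illustration of that producer, with a hypothetical helper name:

	/* Illustration only; the real call sites are in the ordered-data/inode
	 * code, not in this excerpt. */
	static void queue_ordered_for_commit(struct btrfs_trans_handle *trans,
					     struct btrfs_ordered_extent *ordered)
	{
		atomic_inc(&ordered->refs);	/* dropped by btrfs_wait_pending_ordered() */
		list_add_tail(&ordered->trans_list, &trans->ordered);
	}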
1655 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 1748 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
1656 | struct btrfs_root *root) | 1749 | struct btrfs_root *root) |
1657 | { | 1750 | { |
@@ -1702,6 +1795,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1702 | } | 1795 | } |
1703 | 1796 | ||
1704 | spin_lock(&root->fs_info->trans_lock); | 1797 | spin_lock(&root->fs_info->trans_lock); |
1798 | list_splice(&trans->ordered, &cur_trans->pending_ordered); | ||
1705 | if (cur_trans->state >= TRANS_STATE_COMMIT_START) { | 1799 | if (cur_trans->state >= TRANS_STATE_COMMIT_START) { |
1706 | spin_unlock(&root->fs_info->trans_lock); | 1800 | spin_unlock(&root->fs_info->trans_lock); |
1707 | atomic_inc(&cur_trans->use_count); | 1801 | atomic_inc(&cur_trans->use_count); |
@@ -1754,6 +1848,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1754 | 1848 | ||
1755 | btrfs_wait_delalloc_flush(root->fs_info); | 1849 | btrfs_wait_delalloc_flush(root->fs_info); |
1756 | 1850 | ||
1851 | btrfs_wait_pending_ordered(cur_trans, root->fs_info); | ||
1852 | |||
1757 | btrfs_scrub_pause(root); | 1853 | btrfs_scrub_pause(root); |
1758 | /* | 1854 | /* |
1759 | * Ok now we need to make sure to block out any other joins while we | 1855 | * Ok now we need to make sure to block out any other joins while we |
@@ -1842,13 +1938,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1842 | } | 1938 | } |
1843 | 1939 | ||
1844 | /* | 1940 | /* |
1845 | * Since the transaction is done, we should set the inode map cache flag | 1941 | * Since the transaction is done, we can apply the pending changes |
1846 | * before any other comming transaction. | 1942 | * before the next transaction. |
1847 | */ | 1943 | */ |
1848 | if (btrfs_test_opt(root, CHANGE_INODE_CACHE)) | 1944 | btrfs_apply_pending_changes(root->fs_info); |
1849 | btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); | ||
1850 | else | ||
1851 | btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE); | ||
1852 | 1945 | ||
1853 | /* commit_fs_roots gets rid of all the tree log roots, it is now | 1946 | /* commit_fs_roots gets rid of all the tree log roots, it is now |
1854 | * safe to free the root of tree log roots | 1947 | * safe to free the root of tree log roots |
@@ -2019,3 +2112,32 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) | |||
2019 | 2112 | ||
2020 | return (ret < 0) ? 0 : 1; | 2113 | return (ret < 0) ? 0 : 1; |
2021 | } | 2114 | } |
2115 | |||
2116 | void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) | ||
2117 | { | ||
2118 | unsigned long prev; | ||
2119 | unsigned long bit; | ||
2120 | |||
2121 | prev = cmpxchg(&fs_info->pending_changes, 0, 0); | ||
2122 | if (!prev) | ||
2123 | return; | ||
2124 | |||
2125 | bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE; | ||
2126 | if (prev & bit) | ||
2127 | btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE); | ||
2128 | prev &= ~bit; | ||
2129 | |||
2130 | bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE; | ||
2131 | if (prev & bit) | ||
2132 | btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE); | ||
2133 | prev &= ~bit; | ||
2134 | |||
2135 | bit = 1 << BTRFS_PENDING_COMMIT; | ||
2136 | if (prev & bit) | ||
2137 | btrfs_debug(fs_info, "pending commit done"); | ||
2138 | prev &= ~bit; | ||
2139 | |||
2140 | if (prev) | ||
2141 | btrfs_warn(fs_info, | ||
2142 | "unknown pending changes left 0x%lx, ignoring", prev); | ||
2143 | } | ||
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d8f40e1a5d2d..00ed29c4b3f9 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -56,6 +56,7 @@ struct btrfs_transaction { | |||
56 | wait_queue_head_t commit_wait; | 56 | wait_queue_head_t commit_wait; |
57 | struct list_head pending_snapshots; | 57 | struct list_head pending_snapshots; |
58 | struct list_head pending_chunks; | 58 | struct list_head pending_chunks; |
59 | struct list_head pending_ordered; | ||
59 | struct list_head switch_commits; | 60 | struct list_head switch_commits; |
60 | struct btrfs_delayed_ref_root delayed_refs; | 61 | struct btrfs_delayed_ref_root delayed_refs; |
61 | int aborted; | 62 | int aborted; |
@@ -105,6 +106,7 @@ struct btrfs_trans_handle { | |||
105 | */ | 106 | */ |
106 | struct btrfs_root *root; | 107 | struct btrfs_root *root; |
107 | struct seq_list delayed_ref_elem; | 108 | struct seq_list delayed_ref_elem; |
109 | struct list_head ordered; | ||
108 | struct list_head qgroup_ref_list; | 110 | struct list_head qgroup_ref_list; |
109 | struct list_head new_bgs; | 111 | struct list_head new_bgs; |
110 | }; | 112 | }; |
@@ -145,8 +147,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( | |||
145 | struct btrfs_root *root); | 147 | struct btrfs_root *root); |
146 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); | 148 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); |
147 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); | 149 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); |
148 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | ||
149 | struct btrfs_root *root); | ||
150 | 150 | ||
151 | void btrfs_add_dead_root(struct btrfs_root *root); | 151 | void btrfs_add_dead_root(struct btrfs_root *root); |
152 | int btrfs_defrag_root(struct btrfs_root *root); | 152 | int btrfs_defrag_root(struct btrfs_root *root); |
@@ -170,4 +170,6 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, | |||
170 | int btrfs_transaction_blocked(struct btrfs_fs_info *info); | 170 | int btrfs_transaction_blocked(struct btrfs_fs_info *info); |
171 | int btrfs_transaction_in_commit(struct btrfs_fs_info *info); | 171 | int btrfs_transaction_in_commit(struct btrfs_fs_info *info); |
172 | void btrfs_put_transaction(struct btrfs_transaction *transaction); | 172 | void btrfs_put_transaction(struct btrfs_transaction *transaction); |
173 | void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); | ||
174 | |||
173 | #endif | 175 | #endif |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 286213cec861..9a02da16f2be 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -2599,12 +2599,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2599 | index2 = root_log_ctx.log_transid % 2; | 2599 | index2 = root_log_ctx.log_transid % 2; |
2600 | if (atomic_read(&log_root_tree->log_commit[index2])) { | 2600 | if (atomic_read(&log_root_tree->log_commit[index2])) { |
2601 | blk_finish_plug(&plug); | 2601 | blk_finish_plug(&plug); |
2602 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2602 | ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, |
2603 | mark); | ||
2604 | btrfs_wait_logged_extents(trans, log, log_transid); | ||
2603 | wait_log_commit(trans, log_root_tree, | 2605 | wait_log_commit(trans, log_root_tree, |
2604 | root_log_ctx.log_transid); | 2606 | root_log_ctx.log_transid); |
2605 | btrfs_free_logged_extents(log, log_transid); | ||
2606 | mutex_unlock(&log_root_tree->log_mutex); | 2607 | mutex_unlock(&log_root_tree->log_mutex); |
2607 | ret = root_log_ctx.log_ret; | 2608 | if (!ret) |
2609 | ret = root_log_ctx.log_ret; | ||
2608 | goto out; | 2610 | goto out; |
2609 | } | 2611 | } |
2610 | ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); | 2612 | ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid); |
@@ -2641,11 +2643,18 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
2641 | mutex_unlock(&log_root_tree->log_mutex); | 2643 | mutex_unlock(&log_root_tree->log_mutex); |
2642 | goto out_wake_log_root; | 2644 | goto out_wake_log_root; |
2643 | } | 2645 | } |
2644 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2646 | ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
2645 | btrfs_wait_marked_extents(log_root_tree, | 2647 | if (!ret) |
2646 | &log_root_tree->dirty_log_pages, | 2648 | ret = btrfs_wait_marked_extents(log_root_tree, |
2647 | EXTENT_NEW | EXTENT_DIRTY); | 2649 | &log_root_tree->dirty_log_pages, |
2648 | btrfs_wait_logged_extents(log, log_transid); | 2650 | EXTENT_NEW | EXTENT_DIRTY); |
2651 | if (ret) { | ||
2652 | btrfs_set_log_full_commit(root->fs_info, trans); | ||
2653 | btrfs_free_logged_extents(log, log_transid); | ||
2654 | mutex_unlock(&log_root_tree->log_mutex); | ||
2655 | goto out_wake_log_root; | ||
2656 | } | ||
2657 | btrfs_wait_logged_extents(trans, log, log_transid); | ||
2649 | 2658 | ||
2650 | btrfs_set_super_log_root(root->fs_info->super_for_commit, | 2659 | btrfs_set_super_log_root(root->fs_info->super_for_commit, |
2651 | log_root_tree->node->start); | 2660 | log_root_tree->node->start); |
@@ -3626,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, | |||
3626 | test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); | 3635 | test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); |
3627 | 3636 | ||
3628 | if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { | 3637 | if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { |
3638 | /* | ||
3639 | * Clear the AS_EIO/AS_ENOSPC flags from the inode's | ||
3640 | * i_mapping flags, so that the next fsync won't get | ||
3641 | * an outdated io error too. | ||
3642 | */ | ||
3643 | btrfs_inode_check_errors(inode); | ||
3629 | *ordered_io_error = true; | 3644 | *ordered_io_error = true; |
3630 | break; | 3645 | break; |
3631 | } | 3646 | } |
@@ -3766,7 +3781,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
3766 | fi = btrfs_item_ptr(leaf, path->slots[0], | 3781 | fi = btrfs_item_ptr(leaf, path->slots[0], |
3767 | struct btrfs_file_extent_item); | 3782 | struct btrfs_file_extent_item); |
3768 | 3783 | ||
3769 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, | 3784 | btrfs_set_token_file_extent_generation(leaf, fi, trans->transid, |
3770 | &token); | 3785 | &token); |
3771 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) | 3786 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) |
3772 | btrfs_set_token_file_extent_type(leaf, fi, | 3787 | btrfs_set_token_file_extent_type(leaf, fi, |
@@ -3963,7 +3978,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
3963 | 3978 | ||
3964 | mutex_lock(&BTRFS_I(inode)->log_mutex); | 3979 | mutex_lock(&BTRFS_I(inode)->log_mutex); |
3965 | 3980 | ||
3966 | btrfs_get_logged_extents(inode, &logged_list); | 3981 | btrfs_get_logged_extents(inode, &logged_list, start, end); |
3967 | 3982 | ||
3968 | /* | 3983 | /* |
3969 | * a brute force approach to making sure we get the most uptodate | 3984 | * a brute force approach to making sure we get the most uptodate |
@@ -4089,6 +4104,21 @@ log_extents: | |||
4089 | btrfs_release_path(path); | 4104 | btrfs_release_path(path); |
4090 | btrfs_release_path(dst_path); | 4105 | btrfs_release_path(dst_path); |
4091 | if (fast_search) { | 4106 | if (fast_search) { |
4107 | /* | ||
4108 | * Some ordered extents started by fsync might have completed | ||
4109 | * before we collected the ordered extents in logged_list, which | ||
4110 | * means they're gone, not in our logged_list nor in the inode's | ||
4111 | * ordered tree. We want the application/user space to know an | ||
4112 | * error happened while attempting to persist file data so that | ||
4113 | * it can take proper action. If such error happened, we leave | ||
4114 | * without writing to the log tree and the fsync must report the | ||
4115 | * file data write error and not commit the current transaction. | ||
4116 | */ | ||
4117 | err = btrfs_inode_check_errors(inode); | ||
4118 | if (err) { | ||
4119 | ctx->io_err = err; | ||
4120 | goto out_unlock; | ||
4121 | } | ||
4092 | ret = btrfs_log_changed_extents(trans, root, inode, dst_path, | 4122 | ret = btrfs_log_changed_extents(trans, root, inode, dst_path, |
4093 | &logged_list, ctx); | 4123 | &logged_list, ctx); |
4094 | if (ret) { | 4124 | if (ret) { |
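The new check in btrfs_log_inode() above relies on btrfs_inode_check_errors() to surface any AS_EIO/AS_ENOSPC error that writeback recorded on the inode's mapping before the ordered extents were collected. That helper is defined in ctree.h and is not part of this excerpt; it presumably tests and clears the mapping error flags along these lines:

	/* Sketch of the assumed helper; exact definition lives in ctree.h. */
	static inline int btrfs_inode_check_errors(struct inode *inode)
	{
		int ret = 0;

		if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
		    test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
			ret = -ENOSPC;
		if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
		    test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
			ret = -EIO;

		return ret;
	}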
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d47289c715c8..0144790e296e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -53,16 +53,6 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); | |||
53 | DEFINE_MUTEX(uuid_mutex); | 53 | DEFINE_MUTEX(uuid_mutex); |
54 | static LIST_HEAD(fs_uuids); | 54 | static LIST_HEAD(fs_uuids); |
55 | 55 | ||
56 | static void lock_chunks(struct btrfs_root *root) | ||
57 | { | ||
58 | mutex_lock(&root->fs_info->chunk_mutex); | ||
59 | } | ||
60 | |||
61 | static void unlock_chunks(struct btrfs_root *root) | ||
62 | { | ||
63 | mutex_unlock(&root->fs_info->chunk_mutex); | ||
64 | } | ||
65 | |||
66 | static struct btrfs_fs_devices *__alloc_fs_devices(void) | 56 | static struct btrfs_fs_devices *__alloc_fs_devices(void) |
67 | { | 57 | { |
68 | struct btrfs_fs_devices *fs_devs; | 58 | struct btrfs_fs_devices *fs_devs; |
@@ -1068,9 +1058,11 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, | |||
1068 | u64 *start, u64 len) | 1058 | u64 *start, u64 len) |
1069 | { | 1059 | { |
1070 | struct extent_map *em; | 1060 | struct extent_map *em; |
1061 | struct list_head *search_list = &trans->transaction->pending_chunks; | ||
1071 | int ret = 0; | 1062 | int ret = 0; |
1072 | 1063 | ||
1073 | list_for_each_entry(em, &trans->transaction->pending_chunks, list) { | 1064 | again: |
1065 | list_for_each_entry(em, search_list, list) { | ||
1074 | struct map_lookup *map; | 1066 | struct map_lookup *map; |
1075 | int i; | 1067 | int i; |
1076 | 1068 | ||
@@ -1087,6 +1079,10 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, | |||
1087 | ret = 1; | 1079 | ret = 1; |
1088 | } | 1080 | } |
1089 | } | 1081 | } |
1082 | if (search_list == &trans->transaction->pending_chunks) { | ||
1083 | search_list = &trans->root->fs_info->pinned_chunks; | ||
1084 | goto again; | ||
1085 | } | ||
1090 | 1086 | ||
1091 | return ret; | 1087 | return ret; |
1092 | } | 1088 | } |
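contains_pending_extent() above now scans two lists, the transaction's pending_chunks and the new fs_info->pinned_chunks, by retargeting search_list and jumping back into the same loop. An equivalent formulation, shown only to make that control flow explicit (the patch keeps the goto form):

	struct list_head *lists[] = {
		&trans->transaction->pending_chunks,
		&trans->root->fs_info->pinned_chunks,
	};
	int n;

	for (n = 0; n < ARRAY_SIZE(lists); n++) {
		list_for_each_entry(em, lists[n], list) {
			/* same per-stripe overlap test as in the hunk above */
		}
	}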
@@ -1800,8 +1796,8 @@ error_undo: | |||
1800 | goto error_brelse; | 1796 | goto error_brelse; |
1801 | } | 1797 | } |
1802 | 1798 | ||
1803 | void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, | 1799 | void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, |
1804 | struct btrfs_device *srcdev) | 1800 | struct btrfs_device *srcdev) |
1805 | { | 1801 | { |
1806 | struct btrfs_fs_devices *fs_devices; | 1802 | struct btrfs_fs_devices *fs_devices; |
1807 | 1803 | ||
@@ -1829,6 +1825,12 @@ void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, | |||
1829 | 1825 | ||
1830 | if (srcdev->bdev) | 1826 | if (srcdev->bdev) |
1831 | fs_devices->open_devices--; | 1827 | fs_devices->open_devices--; |
1828 | } | ||
1829 | |||
1830 | void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, | ||
1831 | struct btrfs_device *srcdev) | ||
1832 | { | ||
1833 | struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; | ||
1832 | 1834 | ||
1833 | call_rcu(&srcdev->rcu, free_device); | 1835 | call_rcu(&srcdev->rcu, free_device); |
1834 | 1836 | ||
@@ -2647,18 +2649,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, | |||
2647 | } | 2649 | } |
2648 | } | 2650 | } |
2649 | 2651 | ||
2650 | ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); | 2652 | ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em); |
2651 | if (ret) { | 2653 | if (ret) { |
2652 | btrfs_abort_transaction(trans, extent_root, ret); | 2654 | btrfs_abort_transaction(trans, extent_root, ret); |
2653 | goto out; | 2655 | goto out; |
2654 | } | 2656 | } |
2655 | 2657 | ||
2656 | write_lock(&em_tree->lock); | ||
2657 | remove_extent_mapping(em_tree, em); | ||
2658 | write_unlock(&em_tree->lock); | ||
2659 | |||
2660 | /* once for the tree */ | ||
2661 | free_extent_map(em); | ||
2662 | out: | 2658 | out: |
2663 | /* once for us */ | 2659 | /* once for us */ |
2664 | free_extent_map(em); | 2660 | free_extent_map(em); |
@@ -4505,6 +4501,8 @@ error_del_extent: | |||
4505 | free_extent_map(em); | 4501 | free_extent_map(em); |
4506 | /* One for the tree reference */ | 4502 | /* One for the tree reference */ |
4507 | free_extent_map(em); | 4503 | free_extent_map(em); |
4504 | /* One for the pending_chunks list reference */ | ||
4505 | free_extent_map(em); | ||
4508 | error: | 4506 | error: |
4509 | kfree(devices_info); | 4507 | kfree(devices_info); |
4510 | return ret; | 4508 | return ret; |
@@ -4881,13 +4879,15 @@ static inline int parity_smaller(u64 a, u64 b) | |||
4881 | static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) | 4879 | static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) |
4882 | { | 4880 | { |
4883 | struct btrfs_bio_stripe s; | 4881 | struct btrfs_bio_stripe s; |
4882 | int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; | ||
4884 | int i; | 4883 | int i; |
4885 | u64 l; | 4884 | u64 l; |
4886 | int again = 1; | 4885 | int again = 1; |
4886 | int m; | ||
4887 | 4887 | ||
4888 | while (again) { | 4888 | while (again) { |
4889 | again = 0; | 4889 | again = 0; |
4890 | for (i = 0; i < bbio->num_stripes - 1; i++) { | 4890 | for (i = 0; i < real_stripes - 1; i++) { |
4891 | if (parity_smaller(raid_map[i], raid_map[i+1])) { | 4891 | if (parity_smaller(raid_map[i], raid_map[i+1])) { |
4892 | s = bbio->stripes[i]; | 4892 | s = bbio->stripes[i]; |
4893 | l = raid_map[i]; | 4893 | l = raid_map[i]; |
@@ -4895,6 +4895,14 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) | |||
4895 | raid_map[i] = raid_map[i+1]; | 4895 | raid_map[i] = raid_map[i+1]; |
4896 | bbio->stripes[i+1] = s; | 4896 | bbio->stripes[i+1] = s; |
4897 | raid_map[i+1] = l; | 4897 | raid_map[i+1] = l; |
4898 | |||
4899 | if (bbio->tgtdev_map) { | ||
4900 | m = bbio->tgtdev_map[i]; | ||
4901 | bbio->tgtdev_map[i] = | ||
4902 | bbio->tgtdev_map[i + 1]; | ||
4903 | bbio->tgtdev_map[i + 1] = m; | ||
4904 | } | ||
4905 | |||
4898 | again = 1; | 4906 | again = 1; |
4899 | } | 4907 | } |
4900 | } | 4908 | } |
@@ -4923,6 +4931,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
4923 | int ret = 0; | 4931 | int ret = 0; |
4924 | int num_stripes; | 4932 | int num_stripes; |
4925 | int max_errors = 0; | 4933 | int max_errors = 0; |
4934 | int tgtdev_indexes = 0; | ||
4926 | struct btrfs_bio *bbio = NULL; | 4935 | struct btrfs_bio *bbio = NULL; |
4927 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; | 4936 | struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; |
4928 | int dev_replace_is_ongoing = 0; | 4937 | int dev_replace_is_ongoing = 0; |
@@ -5161,15 +5170,14 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
5161 | BTRFS_BLOCK_GROUP_RAID6)) { | 5170 | BTRFS_BLOCK_GROUP_RAID6)) { |
5162 | u64 tmp; | 5171 | u64 tmp; |
5163 | 5172 | ||
5164 | if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) | 5173 | if (raid_map_ret && |
5165 | && raid_map_ret) { | 5174 | ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || |
5175 | mirror_num > 1)) { | ||
5166 | int i, rot; | 5176 | int i, rot; |
5167 | 5177 | ||
5168 | /* push stripe_nr back to the start of the full stripe */ | 5178 | /* push stripe_nr back to the start of the full stripe */ |
5169 | stripe_nr = raid56_full_stripe_start; | 5179 | stripe_nr = raid56_full_stripe_start; |
5170 | do_div(stripe_nr, stripe_len); | 5180 | do_div(stripe_nr, stripe_len * nr_data_stripes(map)); |
5171 | |||
5172 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
5173 | 5181 | ||
5174 | /* RAID[56] write or recovery. Return all stripes */ | 5182 | /* RAID[56] write or recovery. Return all stripes */ |
5175 | num_stripes = map->num_stripes; | 5183 | num_stripes = map->num_stripes; |
@@ -5235,14 +5243,19 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
5235 | num_alloc_stripes <<= 1; | 5243 | num_alloc_stripes <<= 1; |
5236 | if (rw & REQ_GET_READ_MIRRORS) | 5244 | if (rw & REQ_GET_READ_MIRRORS) |
5237 | num_alloc_stripes++; | 5245 | num_alloc_stripes++; |
5246 | tgtdev_indexes = num_stripes; | ||
5238 | } | 5247 | } |
5239 | bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS); | 5248 | |
5249 | bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), | ||
5250 | GFP_NOFS); | ||
5240 | if (!bbio) { | 5251 | if (!bbio) { |
5241 | kfree(raid_map); | 5252 | kfree(raid_map); |
5242 | ret = -ENOMEM; | 5253 | ret = -ENOMEM; |
5243 | goto out; | 5254 | goto out; |
5244 | } | 5255 | } |
5245 | atomic_set(&bbio->error, 0); | 5256 | atomic_set(&bbio->error, 0); |
5257 | if (dev_replace_is_ongoing) | ||
5258 | bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); | ||
5246 | 5259 | ||
5247 | if (rw & REQ_DISCARD) { | 5260 | if (rw & REQ_DISCARD) { |
5248 | int factor = 0; | 5261 | int factor = 0; |
@@ -5327,6 +5340,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
5327 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) | 5340 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) |
5328 | max_errors = btrfs_chunk_max_errors(map); | 5341 | max_errors = btrfs_chunk_max_errors(map); |
5329 | 5342 | ||
5343 | tgtdev_indexes = 0; | ||
5330 | if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && | 5344 | if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && |
5331 | dev_replace->tgtdev != NULL) { | 5345 | dev_replace->tgtdev != NULL) { |
5332 | int index_where_to_add; | 5346 | int index_where_to_add; |
@@ -5355,8 +5369,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
5355 | new->physical = old->physical; | 5369 | new->physical = old->physical; |
5356 | new->length = old->length; | 5370 | new->length = old->length; |
5357 | new->dev = dev_replace->tgtdev; | 5371 | new->dev = dev_replace->tgtdev; |
5372 | bbio->tgtdev_map[i] = index_where_to_add; | ||
5358 | index_where_to_add++; | 5373 | index_where_to_add++; |
5359 | max_errors++; | 5374 | max_errors++; |
5375 | tgtdev_indexes++; | ||
5360 | } | 5376 | } |
5361 | } | 5377 | } |
5362 | num_stripes = index_where_to_add; | 5378 | num_stripes = index_where_to_add; |
@@ -5402,7 +5418,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
5402 | tgtdev_stripe->length = | 5418 | tgtdev_stripe->length = |
5403 | bbio->stripes[index_srcdev].length; | 5419 | bbio->stripes[index_srcdev].length; |
5404 | tgtdev_stripe->dev = dev_replace->tgtdev; | 5420 | tgtdev_stripe->dev = dev_replace->tgtdev; |
5421 | bbio->tgtdev_map[index_srcdev] = num_stripes; | ||
5405 | 5422 | ||
5423 | tgtdev_indexes++; | ||
5406 | num_stripes++; | 5424 | num_stripes++; |
5407 | } | 5425 | } |
5408 | } | 5426 | } |
@@ -5412,6 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
5412 | bbio->num_stripes = num_stripes; | 5430 | bbio->num_stripes = num_stripes; |
5413 | bbio->max_errors = max_errors; | 5431 | bbio->max_errors = max_errors; |
5414 | bbio->mirror_num = mirror_num; | 5432 | bbio->mirror_num = mirror_num; |
5433 | bbio->num_tgtdevs = tgtdev_indexes; | ||
5415 | 5434 | ||
5416 | /* | 5435 | /* |
5417 | * this is the case that REQ_READ && dev_replace_is_ongoing && | 5436 | * this is the case that REQ_READ && dev_replace_is_ongoing && |
@@ -5443,6 +5462,16 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
5443 | mirror_num, NULL); | 5462 | mirror_num, NULL); |
5444 | } | 5463 | } |
5445 | 5464 | ||
5465 | /* For Scrub/replace */ | ||
5466 | int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, | ||
5467 | u64 logical, u64 *length, | ||
5468 | struct btrfs_bio **bbio_ret, int mirror_num, | ||
5469 | u64 **raid_map_ret) | ||
5470 | { | ||
5471 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, | ||
5472 | mirror_num, raid_map_ret); | ||
5473 | } | ||
5474 | |||
5446 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 5475 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
5447 | u64 chunk_start, u64 physical, u64 devid, | 5476 | u64 chunk_start, u64 physical, u64 devid, |
5448 | u64 **logical, int *naddrs, int *stripe_len) | 5477 | u64 **logical, int *naddrs, int *stripe_len) |
@@ -5812,12 +5841,9 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
5812 | } else { | 5841 | } else { |
5813 | ret = raid56_parity_recover(root, bio, bbio, | 5842 | ret = raid56_parity_recover(root, bio, bbio, |
5814 | raid_map, map_length, | 5843 | raid_map, map_length, |
5815 | mirror_num); | 5844 | mirror_num, 1); |
5816 | } | 5845 | } |
5817 | /* | 5846 | |
5818 | * FIXME, replace dosen't support raid56 yet, please fix | ||
5819 | * it in the future. | ||
5820 | */ | ||
5821 | btrfs_bio_counter_dec(root->fs_info); | 5847 | btrfs_bio_counter_dec(root->fs_info); |
5822 | return ret; | 5848 | return ret; |
5823 | } | 5849 | } |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 08980fa23039..d6fe73c0f4a2 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
@@ -292,7 +292,7 @@ struct btrfs_bio_stripe { | |||
292 | struct btrfs_bio; | 292 | struct btrfs_bio; |
293 | typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); | 293 | typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); |
294 | 294 | ||
295 | #define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1 | 295 | #define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) |
296 | 296 | ||
297 | struct btrfs_bio { | 297 | struct btrfs_bio { |
298 | atomic_t stripes_pending; | 298 | atomic_t stripes_pending; |
@@ -305,6 +305,8 @@ struct btrfs_bio { | |||
305 | int max_errors; | 305 | int max_errors; |
306 | int num_stripes; | 306 | int num_stripes; |
307 | int mirror_num; | 307 | int mirror_num; |
308 | int num_tgtdevs; | ||
309 | int *tgtdev_map; | ||
308 | struct btrfs_bio_stripe stripes[]; | 310 | struct btrfs_bio_stripe stripes[]; |
309 | }; | 311 | }; |
310 | 312 | ||
@@ -387,12 +389,18 @@ struct btrfs_balance_control { | |||
387 | int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, | 389 | int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, |
388 | u64 end, u64 *length); | 390 | u64 end, u64 *length); |
389 | 391 | ||
390 | #define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \ | 392 | #define btrfs_bio_size(total_stripes, real_stripes) \ |
391 | (sizeof(struct btrfs_bio_stripe) * (n))) | 393 | (sizeof(struct btrfs_bio) + \ |
394 | (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \ | ||
395 | (sizeof(int) * (real_stripes))) | ||
392 | 396 | ||
393 | int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | 397 | int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
394 | u64 logical, u64 *length, | 398 | u64 logical, u64 *length, |
395 | struct btrfs_bio **bbio_ret, int mirror_num); | 399 | struct btrfs_bio **bbio_ret, int mirror_num); |
400 | int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, | ||
401 | u64 logical, u64 *length, | ||
402 | struct btrfs_bio **bbio_ret, int mirror_num, | ||
403 | u64 **raid_map_ret); | ||
396 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 404 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
397 | u64 chunk_start, u64 physical, u64 devid, | 405 | u64 chunk_start, u64 physical, u64 devid, |
398 | u64 **logical, int *naddrs, int *stripe_len); | 406 | u64 **logical, int *naddrs, int *stripe_len); |
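The reworked btrfs_bio_size() packs the per-stripe target-device index array into the same allocation as the stripe array, which is why __btrfs_map_block() in the volumes.c hunks above can point bbio->tgtdev_map at the tail of the buffer with plain pointer arithmetic. The allocation/initialization pattern, condensed from those hunks:

	/* One allocation: struct btrfs_bio + stripes[] + tgtdev_map[]. */
	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
		       GFP_NOFS);
	if (!bbio)
		return -ENOMEM;
	/* tgtdev_map[] starts right after the flexible stripes[] array. */
	if (dev_replace_is_ongoing)
		bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);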
@@ -448,8 +456,10 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); | |||
448 | int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); | 456 | int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); |
449 | int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, | 457 | int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, |
450 | struct btrfs_fs_info *fs_info); | 458 | struct btrfs_fs_info *fs_info); |
451 | void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, | 459 | void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, |
452 | struct btrfs_device *srcdev); | 460 | struct btrfs_device *srcdev); |
461 | void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info, | ||
462 | struct btrfs_device *srcdev); | ||
453 | void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | 463 | void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, |
454 | struct btrfs_device *tgtdev); | 464 | struct btrfs_device *tgtdev); |
455 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, | 465 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, |
@@ -513,4 +523,16 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, | |||
513 | void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); | 523 | void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); |
514 | void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, | 524 | void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, |
515 | struct btrfs_transaction *transaction); | 525 | struct btrfs_transaction *transaction); |
526 | |||
527 | static inline void lock_chunks(struct btrfs_root *root) | ||
528 | { | ||
529 | mutex_lock(&root->fs_info->chunk_mutex); | ||
530 | } | ||
531 | |||
532 | static inline void unlock_chunks(struct btrfs_root *root) | ||
533 | { | ||
534 | mutex_unlock(&root->fs_info->chunk_mutex); | ||
535 | } | ||
536 | |||
537 | |||
516 | #endif | 538 | #endif |
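Moving lock_chunks()/unlock_chunks() into volumes.h as static inlines lets code outside volumes.c serialize against chunk allocation; in this series the block-group removal path (not shown here) holds the chunk mutex while it pulls the chunk's extent map out of the mapping tree, taking over the work removed from btrfs_remove_chunk() above. A hedged usage sketch, not the exact call site:

	/* Illustrative only; the real code is in btrfs_remove_block_group(). */
	lock_chunks(root);
	write_lock(&em_tree->lock);
	remove_extent_mapping(em_tree, em);
	write_unlock(&em_tree->lock);
	unlock_chunks(root);
	/* drop the tree's reference */
	free_extent_map(em);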
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index dcf20131fbe4..47b19465f0dc 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include "xattr.h" | 29 | #include "xattr.h" |
30 | #include "disk-io.h" | 30 | #include "disk-io.h" |
31 | #include "props.h" | 31 | #include "props.h" |
32 | #include "locking.h" | ||
32 | 33 | ||
33 | 34 | ||
34 | ssize_t __btrfs_getxattr(struct inode *inode, const char *name, | 35 | ssize_t __btrfs_getxattr(struct inode *inode, const char *name, |
@@ -91,7 +92,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, | |||
91 | struct inode *inode, const char *name, | 92 | struct inode *inode, const char *name, |
92 | const void *value, size_t size, int flags) | 93 | const void *value, size_t size, int flags) |
93 | { | 94 | { |
94 | struct btrfs_dir_item *di; | 95 | struct btrfs_dir_item *di = NULL; |
95 | struct btrfs_root *root = BTRFS_I(inode)->root; | 96 | struct btrfs_root *root = BTRFS_I(inode)->root; |
96 | struct btrfs_path *path; | 97 | struct btrfs_path *path; |
97 | size_t name_len = strlen(name); | 98 | size_t name_len = strlen(name); |
@@ -103,84 +104,119 @@ static int do_setxattr(struct btrfs_trans_handle *trans, | |||
103 | path = btrfs_alloc_path(); | 104 | path = btrfs_alloc_path(); |
104 | if (!path) | 105 | if (!path) |
105 | return -ENOMEM; | 106 | return -ENOMEM; |
107 | path->skip_release_on_error = 1; | ||
108 | |||
109 | if (!value) { | ||
110 | di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), | ||
111 | name, name_len, -1); | ||
112 | if (!di && (flags & XATTR_REPLACE)) | ||
113 | ret = -ENODATA; | ||
114 | else if (di) | ||
115 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | ||
116 | goto out; | ||
117 | } | ||
106 | 118 | ||
119 | /* | ||
120 | * For a replace we can't just do the insert blindly. | ||
121 | * Do a lookup first (read-only btrfs_search_slot), and return if the | ||
122 | * xattr doesn't exist. If it exists, fall through to the insert/replace | ||
123 | * path - we can't race with a concurrent xattr delete, because the VFS | ||
124 | * locks the inode's i_mutex before calling setxattr or removexattr. | ||
125 | */ | ||
107 | if (flags & XATTR_REPLACE) { | 126 | if (flags & XATTR_REPLACE) { |
108 | di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, | 127 | ASSERT(mutex_is_locked(&inode->i_mutex)); |
109 | name_len, -1); | 128 | di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), |
110 | if (IS_ERR(di)) { | 129 | name, name_len, 0); |
111 | ret = PTR_ERR(di); | 130 | if (!di) { |
112 | goto out; | ||
113 | } else if (!di) { | ||
114 | ret = -ENODATA; | 131 | ret = -ENODATA; |
115 | goto out; | 132 | goto out; |
116 | } | 133 | } |
117 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | ||
118 | if (ret) | ||
119 | goto out; | ||
120 | btrfs_release_path(path); | 134 | btrfs_release_path(path); |
135 | di = NULL; | ||
136 | } | ||
121 | 137 | ||
138 | ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), | ||
139 | name, name_len, value, size); | ||
140 | if (ret == -EOVERFLOW) { | ||
122 | /* | 141 | /* |
123 | * remove the attribute | 142 | * We have an existing item in a leaf, split_leaf couldn't |
143 | * expand it. That item may or may not contain a dir_item that | ||
144 | * matches our target xattr, so let's check. | ||
124 | */ | 145 | */ |
125 | if (!value) | 146 | ret = 0; |
126 | goto out; | 147 | btrfs_assert_tree_locked(path->nodes[0]); |
127 | } else { | 148 | di = btrfs_match_dir_item_name(root, path, name, name_len); |
128 | di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), | 149 | if (!di && !(flags & XATTR_REPLACE)) { |
129 | name, name_len, 0); | 150 | ret = -ENOSPC; |
130 | if (IS_ERR(di)) { | ||
131 | ret = PTR_ERR(di); | ||
132 | goto out; | 151 | goto out; |
133 | } | 152 | } |
134 | if (!di && !value) | 153 | } else if (ret == -EEXIST) { |
135 | goto out; | 154 | ret = 0; |
136 | btrfs_release_path(path); | 155 | di = btrfs_match_dir_item_name(root, path, name, name_len); |
156 | ASSERT(di); /* logic error */ | ||
157 | } else if (ret) { | ||
158 | goto out; | ||
137 | } | 159 | } |
138 | 160 | ||
139 | again: | 161 | if (di && (flags & XATTR_CREATE)) { |
140 | ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), | ||
141 | name, name_len, value, size); | ||
142 | /* | ||
143 | * If we're setting an xattr to a new value but the new value is say | ||
144 | * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting | ||
145 | * back from split_leaf. This is because it thinks we'll be extending | ||
146 | * the existing item size, but we're asking for enough space to add the | ||
147 | * item itself. So if we get EOVERFLOW just set ret to EEXIST and let | ||
148 | * the rest of the function figure it out. | ||
149 | */ | ||
150 | if (ret == -EOVERFLOW) | ||
151 | ret = -EEXIST; | 162 | ret = -EEXIST; |
163 | goto out; | ||
164 | } | ||
152 | 165 | ||
153 | if (ret == -EEXIST) { | 166 | if (di) { |
154 | if (flags & XATTR_CREATE) | ||
155 | goto out; | ||
156 | /* | 167 | /* |
157 | * We can't use the path we already have since we won't have the | 168 | * We're doing a replace, and it must be atomic, that is, at |
158 | * proper locking for a delete, so release the path and | 169 | * any point in time we have either the old or the new xattr |
159 | * re-lookup to delete the thing. | 170 | * value in the tree. We don't want readers (getxattr and |
171 | * listxattrs) to miss a value; this is especially important | ||
172 | * for ACLs. | ||
160 | */ | 173 | */ |
161 | btrfs_release_path(path); | 174 | const int slot = path->slots[0]; |
162 | di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), | 175 | struct extent_buffer *leaf = path->nodes[0]; |
163 | name, name_len, -1); | 176 | const u16 old_data_len = btrfs_dir_data_len(leaf, di); |
164 | if (IS_ERR(di)) { | 177 | const u32 item_size = btrfs_item_size_nr(leaf, slot); |
165 | ret = PTR_ERR(di); | 178 | const u32 data_size = sizeof(*di) + name_len + size; |
166 | goto out; | 179 | struct btrfs_item *item; |
167 | } else if (!di) { | 180 | unsigned long data_ptr; |
168 | /* Shouldn't happen but just in case... */ | 181 | char *ptr; |
169 | btrfs_release_path(path); | 182 | |
170 | goto again; | 183 | if (size > old_data_len) { |
184 | if (btrfs_leaf_free_space(root, leaf) < | ||
185 | (size - old_data_len)) { | ||
186 | ret = -ENOSPC; | ||
187 | goto out; | ||
188 | } | ||
171 | } | 189 | } |
172 | 190 | ||
173 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | 191 | if (old_data_len + name_len + sizeof(*di) == item_size) { |
174 | if (ret) | 192 | /* No other xattrs packed in the same leaf item. */ |
175 | goto out; | 193 | if (size > old_data_len) |
194 | btrfs_extend_item(root, path, | ||
195 | size - old_data_len); | ||
196 | else if (size < old_data_len) | ||
197 | btrfs_truncate_item(root, path, data_size, 1); | ||
198 | } else { | ||
199 | /* There are other xattrs packed in the same item. */ | ||
200 | ret = btrfs_delete_one_dir_name(trans, root, path, di); | ||
201 | if (ret) | ||
202 | goto out; | ||
203 | btrfs_extend_item(root, path, data_size); | ||
204 | } | ||
176 | 205 | ||
206 | item = btrfs_item_nr(slot); | ||
207 | ptr = btrfs_item_ptr(leaf, slot, char); | ||
208 | ptr += btrfs_item_size(leaf, item) - data_size; | ||
209 | di = (struct btrfs_dir_item *)ptr; | ||
210 | btrfs_set_dir_data_len(leaf, di, size); | ||
211 | data_ptr = ((unsigned long)(di + 1)) + name_len; | ||
212 | write_extent_buffer(leaf, value, data_ptr, size); | ||
213 | btrfs_mark_buffer_dirty(leaf); | ||
214 | } else { | ||
177 | /* | 215 | /* |
178 | * We have a value to set, so go back and try to insert it now. | 216 | * Insert, and we had space for the xattr, so path->slots[0] is |
217 | * where our xattr dir_item is and btrfs_insert_xattr_item() | ||
218 | * filled it. | ||
179 | */ | 219 | */ |
180 | if (value) { | ||
181 | btrfs_release_path(path); | ||
182 | goto again; | ||
183 | } | ||
184 | } | 220 | } |
185 | out: | 221 | out: |
186 | btrfs_free_path(path); | 222 | btrfs_free_path(path); |
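The rewritten do_setxattr() keeps the VFS-visible flag semantics (XATTR_CREATE fails with EEXIST when the attribute already exists, XATTR_REPLACE with ENODATA when it does not) while updating an existing xattr's value in place, so a concurrent getxattr() never observes the attribute as momentarily missing. A small userspace sketch of those flags; the file path is illustrative only.

	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char *path = "/mnt/btrfs/somefile";  /* illustrative */
		const char *val1 = "first";
		const char *val2 = "second";

		/* Create only: fails with EEXIST if user.demo exists. */
		if (setxattr(path, "user.demo", val1, strlen(val1),
			     XATTR_CREATE))
			perror("XATTR_CREATE");

		/*
		 * Replace only: fails with ENODATA if user.demo is absent.
		 * With this patch the switch is atomic on btrfs: a
		 * concurrent getxattr() sees either "first" or "second",
		 * never a missing attribute.
		 */
		if (setxattr(path, "user.demo", val2, strlen(val2),
			     XATTR_REPLACE))
			perror("XATTR_REPLACE");

		return 0;
	}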