diff options
author | Chris Mason <chris.mason@oracle.com> | 2009-09-11 12:27:37 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2009-09-11 13:31:07 -0400 |
commit | a1ed835e1ab5795f91b198d08c43e2f56848dcf3 (patch) | |
tree | ac3b370823fa76c5be7698e3663306badbbd622d | |
parent | 8b62b72b26bcd72082c4a69d179dd906bcc22200 (diff) |
Btrfs: Fix extent replacement race
Data COW means that whenever we write to a file, we replace any old
extent pointers with new ones. There was a window where a readpage
might find the old extent pointers on disk and cache them in the
in-RAM extent_map tree while a write was in the middle of replacing them.
Even though both the readpage and the write had their respective bytes
in the file locked, the extent that readpage inserts may cover more bytes
than it had locked down.
This commit closes the race by keeping the new extent pinned in the extent
map tree until after the on-disk btree is properly set up with the new
extent pointers.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- | fs/btrfs/ctree.h | 2 | ||||
-rw-r--r-- | fs/btrfs/extent_map.c | 50 | ||||
-rw-r--r-- | fs/btrfs/extent_map.h | 1 | ||||
-rw-r--r-- | fs/btrfs/file.c | 8 | ||||
-rw-r--r-- | fs/btrfs/inode.c | 25 | ||||
-rw-r--r-- | fs/btrfs/ioctl.c | 5 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 2 |
7 files changed, 80 insertions, 13 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 38eeb6c49c8a..1ceab8b4d6dc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h | |||
@@ -2292,7 +2292,7 @@ extern struct file_operations btrfs_file_operations; | |||
2292 | int btrfs_drop_extents(struct btrfs_trans_handle *trans, | 2292 | int btrfs_drop_extents(struct btrfs_trans_handle *trans, |
2293 | struct btrfs_root *root, struct inode *inode, | 2293 | struct btrfs_root *root, struct inode *inode, |
2294 | u64 start, u64 end, u64 locked_end, | 2294 | u64 start, u64 end, u64 locked_end, |
2295 | u64 inline_limit, u64 *hint_block); | 2295 | u64 inline_limit, u64 *hint_block, int drop_cache); |
2296 | int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, | 2296 | int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, |
2297 | struct btrfs_root *root, | 2297 | struct btrfs_root *root, |
2298 | struct inode *inode, u64 start, u64 end); | 2298 | struct inode *inode, u64 start, u64 end); |
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 72e9fa3c31f5..5bc7a0d325e7 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c | |||
@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) | |||
198 | return 0; | 198 | return 0; |
199 | } | 199 | } |
200 | 200 | ||
201 | int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) | ||
202 | { | ||
203 | int ret = 0; | ||
204 | struct extent_map *merge = NULL; | ||
205 | struct rb_node *rb; | ||
206 | struct extent_map *em; | ||
207 | |||
208 | write_lock(&tree->lock); | ||
209 | em = lookup_extent_mapping(tree, start, len); | ||
210 | |||
211 | WARN_ON(em->start != start || !em); | ||
212 | |||
213 | if (!em) | ||
214 | goto out; | ||
215 | |||
216 | clear_bit(EXTENT_FLAG_PINNED, &em->flags); | ||
217 | |||
218 | if (em->start != 0) { | ||
219 | rb = rb_prev(&em->rb_node); | ||
220 | if (rb) | ||
221 | merge = rb_entry(rb, struct extent_map, rb_node); | ||
222 | if (rb && mergable_maps(merge, em)) { | ||
223 | em->start = merge->start; | ||
224 | em->len += merge->len; | ||
225 | em->block_len += merge->block_len; | ||
226 | em->block_start = merge->block_start; | ||
227 | merge->in_tree = 0; | ||
228 | rb_erase(&merge->rb_node, &tree->map); | ||
229 | free_extent_map(merge); | ||
230 | } | ||
231 | } | ||
232 | |||
233 | rb = rb_next(&em->rb_node); | ||
234 | if (rb) | ||
235 | merge = rb_entry(rb, struct extent_map, rb_node); | ||
236 | if (rb && mergable_maps(em, merge)) { | ||
237 | em->len += merge->len; | ||
238 | em->block_len += merge->len; | ||
239 | rb_erase(&merge->rb_node, &tree->map); | ||
240 | merge->in_tree = 0; | ||
241 | free_extent_map(merge); | ||
242 | } | ||
243 | |||
244 | free_extent_map(em); | ||
245 | out: | ||
246 | write_unlock(&tree->lock); | ||
247 | return ret; | ||
248 | |||
249 | } | ||
250 | |||
201 | /** | 251 | /** |
202 | * add_extent_mapping - add new extent map to the extent tree | 252 | * add_extent_mapping - add new extent map to the extent tree |
203 | * @tree: tree to insert new map in | 253 | * @tree: tree to insert new map in |
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 6216dfbcf9be..d3d442f4bbbd 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h | |||
@@ -59,4 +59,5 @@ struct extent_map *alloc_extent_map(gfp_t mask); | |||
59 | void free_extent_map(struct extent_map *em); | 59 | void free_extent_map(struct extent_map *em); |
60 | int __init extent_map_init(void); | 60 | int __init extent_map_init(void); |
61 | void extent_map_exit(void); | 61 | void extent_map_exit(void); |
62 | int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len); | ||
62 | #endif | 63 | #endif |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ef66c3d989b9..4123db9d5141 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -177,10 +177,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | |||
177 | } | 177 | } |
178 | flags = em->flags; | 178 | flags = em->flags; |
179 | if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { | 179 | if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { |
180 | write_unlock(&em_tree->lock); | ||
181 | if (em->start <= start && | 180 | if (em->start <= start && |
182 | (!testend || em->start + em->len >= start + len)) { | 181 | (!testend || em->start + em->len >= start + len)) { |
183 | free_extent_map(em); | 182 | free_extent_map(em); |
183 | write_unlock(&em_tree->lock); | ||
184 | break; | 184 | break; |
185 | } | 185 | } |
186 | if (start < em->start) { | 186 | if (start < em->start) { |
@@ -190,6 +190,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | |||
190 | start = em->start + em->len; | 190 | start = em->start + em->len; |
191 | } | 191 | } |
192 | free_extent_map(em); | 192 | free_extent_map(em); |
193 | write_unlock(&em_tree->lock); | ||
193 | continue; | 194 | continue; |
194 | } | 195 | } |
195 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | 196 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); |
@@ -269,7 +270,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, | |||
269 | noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, | 270 | noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, |
270 | struct btrfs_root *root, struct inode *inode, | 271 | struct btrfs_root *root, struct inode *inode, |
271 | u64 start, u64 end, u64 locked_end, | 272 | u64 start, u64 end, u64 locked_end, |
272 | u64 inline_limit, u64 *hint_byte) | 273 | u64 inline_limit, u64 *hint_byte, int drop_cache) |
273 | { | 274 | { |
274 | u64 extent_end = 0; | 275 | u64 extent_end = 0; |
275 | u64 search_start = start; | 276 | u64 search_start = start; |
@@ -294,7 +295,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, | |||
294 | int ret; | 295 | int ret; |
295 | 296 | ||
296 | inline_limit = 0; | 297 | inline_limit = 0; |
297 | btrfs_drop_extent_cache(inode, start, end - 1, 0); | 298 | if (drop_cache) |
299 | btrfs_drop_extent_cache(inode, start, end - 1, 0); | ||
298 | 300 | ||
299 | path = btrfs_alloc_path(); | 301 | path = btrfs_alloc_path(); |
300 | if (!path) | 302 | if (!path) |
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 739a245e25d6..233fe6f26120 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -232,7 +232,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
232 | } | 232 | } |
233 | 233 | ||
234 | ret = btrfs_drop_extents(trans, root, inode, start, | 234 | ret = btrfs_drop_extents(trans, root, inode, start, |
235 | aligned_end, aligned_end, start, &hint_byte); | 235 | aligned_end, aligned_end, start, |
236 | &hint_byte, 1); | ||
236 | BUG_ON(ret); | 237 | BUG_ON(ret); |
237 | 238 | ||
238 | if (isize > actual_end) | 239 | if (isize > actual_end) |
@@ -241,7 +242,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
241 | inline_len, compressed_size, | 242 | inline_len, compressed_size, |
242 | compressed_pages); | 243 | compressed_pages); |
243 | BUG_ON(ret); | 244 | BUG_ON(ret); |
244 | btrfs_drop_extent_cache(inode, start, aligned_end, 0); | 245 | btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); |
245 | return 0; | 246 | return 0; |
246 | } | 247 | } |
247 | 248 | ||
@@ -1455,9 +1456,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
1455 | BUG_ON(!path); | 1456 | BUG_ON(!path); |
1456 | 1457 | ||
1457 | path->leave_spinning = 1; | 1458 | path->leave_spinning = 1; |
1459 | |||
1460 | /* | ||
1461 | * we may be replacing one extent in the tree with another. | ||
1462 | * The new extent is pinned in the extent map, and we don't want | ||
1463 | * to drop it from the cache until it is completely in the btree. | ||
1464 | * | ||
1465 | * So, tell btrfs_drop_extents to leave this extent in the cache. | ||
1466 | * the caller is expected to unpin it and allow it to be merged | ||
1467 | * with the others. | ||
1468 | */ | ||
1458 | ret = btrfs_drop_extents(trans, root, inode, file_pos, | 1469 | ret = btrfs_drop_extents(trans, root, inode, file_pos, |
1459 | file_pos + num_bytes, locked_end, | 1470 | file_pos + num_bytes, locked_end, |
1460 | file_pos, &hint); | 1471 | file_pos, &hint, 0); |
1461 | BUG_ON(ret); | 1472 | BUG_ON(ret); |
1462 | 1473 | ||
1463 | ins.objectid = inode->i_ino; | 1474 | ins.objectid = inode->i_ino; |
@@ -1485,7 +1496,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
1485 | btrfs_mark_buffer_dirty(leaf); | 1496 | btrfs_mark_buffer_dirty(leaf); |
1486 | 1497 | ||
1487 | inode_add_bytes(inode, num_bytes); | 1498 | inode_add_bytes(inode, num_bytes); |
1488 | btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); | ||
1489 | 1499 | ||
1490 | ins.objectid = disk_bytenr; | 1500 | ins.objectid = disk_bytenr; |
1491 | ins.offset = disk_num_bytes; | 1501 | ins.offset = disk_num_bytes; |
@@ -1596,6 +1606,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) | |||
1596 | ordered_extent->len, | 1606 | ordered_extent->len, |
1597 | compressed, 0, 0, | 1607 | compressed, 0, 0, |
1598 | BTRFS_FILE_EXTENT_REG); | 1608 | BTRFS_FILE_EXTENT_REG); |
1609 | unpin_extent_cache(&BTRFS_I(inode)->extent_tree, | ||
1610 | ordered_extent->file_offset, | ||
1611 | ordered_extent->len); | ||
1599 | BUG_ON(ret); | 1612 | BUG_ON(ret); |
1600 | } | 1613 | } |
1601 | unlock_extent(io_tree, ordered_extent->file_offset, | 1614 | unlock_extent(io_tree, ordered_extent->file_offset, |
@@ -2940,7 +2953,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) | |||
2940 | cur_offset, | 2953 | cur_offset, |
2941 | cur_offset + hole_size, | 2954 | cur_offset + hole_size, |
2942 | block_end, | 2955 | block_end, |
2943 | cur_offset, &hint_byte); | 2956 | cur_offset, &hint_byte, 1); |
2944 | if (err) | 2957 | if (err) |
2945 | break; | 2958 | break; |
2946 | err = btrfs_insert_file_extent(trans, root, | 2959 | err = btrfs_insert_file_extent(trans, root, |
@@ -5086,6 +5099,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans, | |||
5086 | 0, 0, 0, | 5099 | 0, 0, 0, |
5087 | BTRFS_FILE_EXTENT_PREALLOC); | 5100 | BTRFS_FILE_EXTENT_PREALLOC); |
5088 | BUG_ON(ret); | 5101 | BUG_ON(ret); |
5102 | btrfs_drop_extent_cache(inode, cur_offset, | ||
5103 | cur_offset + ins.offset -1, 0); | ||
5089 | num_bytes -= ins.offset; | 5104 | num_bytes -= ins.offset; |
5090 | cur_offset += ins.offset; | 5105 | cur_offset += ins.offset; |
5091 | alloc_hint = ins.objectid + ins.offset; | 5106 | alloc_hint = ins.objectid + ins.offset; |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9f4db848db10..e2d8e90259b0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -597,9 +597,8 @@ again: | |||
597 | clear_page_dirty_for_io(page); | 597 | clear_page_dirty_for_io(page); |
598 | 598 | ||
599 | btrfs_set_extent_delalloc(inode, page_start, page_end); | 599 | btrfs_set_extent_delalloc(inode, page_start, page_end); |
600 | |||
601 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
602 | set_page_dirty(page); | 600 | set_page_dirty(page); |
601 | unlock_extent(io_tree, page_start, page_end, GFP_NOFS); | ||
603 | unlock_page(page); | 602 | unlock_page(page); |
604 | page_cache_release(page); | 603 | page_cache_release(page); |
605 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); | 604 | balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); |
@@ -977,7 +976,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, | |||
977 | 976 | ||
978 | /* punch hole in destination first */ | 977 | /* punch hole in destination first */ |
979 | btrfs_drop_extents(trans, root, inode, off, off + len, | 978 | btrfs_drop_extents(trans, root, inode, off, off + len, |
980 | off + len, 0, &hint_byte); | 979 | off + len, 0, &hint_byte, 1); |
981 | 980 | ||
982 | /* clone data */ | 981 | /* clone data */ |
983 | key.objectid = src->i_ino; | 982 | key.objectid = src->i_ino; |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d91b0de7c502..8661a7381b39 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
534 | saved_nbytes = inode_get_bytes(inode); | 534 | saved_nbytes = inode_get_bytes(inode); |
535 | /* drop any overlapping extents */ | 535 | /* drop any overlapping extents */ |
536 | ret = btrfs_drop_extents(trans, root, inode, | 536 | ret = btrfs_drop_extents(trans, root, inode, |
537 | start, extent_end, extent_end, start, &alloc_hint); | 537 | start, extent_end, extent_end, start, &alloc_hint, 1); |
538 | BUG_ON(ret); | 538 | BUG_ON(ret); |
539 | 539 | ||
540 | if (found_type == BTRFS_FILE_EXTENT_REG || | 540 | if (found_type == BTRFS_FILE_EXTENT_REG || |