aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c189
1 files changed, 122 insertions, 67 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 404704d26822..f3abecc2d14c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
83 83
84static void btrfs_truncate(struct inode *inode); 84static void btrfs_truncate(struct inode *inode);
85 85
86/*
87 * a very lame attempt at stopping writes when the FS is 85% full. There
88 * are countless ways this is incorrect, but it is better than nothing.
89 */
86int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, 90int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
87 int for_del) 91 int for_del)
88{ 92{
@@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
108 return ret; 112 return ret;
109} 113}
110 114
115/*
116 * when extent_io.c finds a delayed allocation range in the file,
117 * the call backs end up in this code. The basic idea is to
118 * allocate extents on disk for the range, and create ordered data structs
119 * in ram to track those extents.
120 */
111static int cow_file_range(struct inode *inode, u64 start, u64 end) 121static int cow_file_range(struct inode *inode, u64 start, u64 end)
112{ 122{
113 struct btrfs_root *root = BTRFS_I(inode)->root; 123 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -185,6 +195,13 @@ out:
185 return ret; 195 return ret;
186} 196}
187 197
198/*
199 * nocow writeback callback. This checks for snapshots or COW copies
200 * of the extents that exist in the file, and COWs the file as required.
201 *
202 * If no cow copies or snapshots exist, we write directly to the existing
203 * blocks on disk
204 */
188static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) 205static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
189{ 206{
190 u64 extent_start; 207 u64 extent_start;
@@ -291,6 +308,9 @@ out:
291 return err; 308 return err;
292} 309}
293 310
311/*
312 * extent_io.c call back to do delayed allocation processing
313 */
294static int run_delalloc_range(struct inode *inode, u64 start, u64 end) 314static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
295{ 315{
296 struct btrfs_root *root = BTRFS_I(inode)->root; 316 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
305 return ret; 325 return ret;
306} 326}
307 327
328/*
329 * extent_io.c set_bit_hook, used to track delayed allocation
330 * bytes in this file, and to maintain the list of inodes that
331 * have pending delalloc work to be done.
332 */
308int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 333int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
309 unsigned long old, unsigned long bits) 334 unsigned long old, unsigned long bits)
310{ 335{
@@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
323 return 0; 348 return 0;
324} 349}
325 350
351/*
352 * extent_io.c clear_bit_hook, see set_bit_hook for why
353 */
326int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, 354int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
327 unsigned long old, unsigned long bits) 355 unsigned long old, unsigned long bits)
328{ 356{
@@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
349 return 0; 377 return 0;
350} 378}
351 379
380/*
381 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
382 * we don't create bios that span stripes or chunks
383 */
352int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 384int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
353 size_t size, struct bio *bio) 385 size_t size, struct bio *bio)
354{ 386{
@@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
371 return 0; 403 return 0;
372} 404}
373 405
406/*
407 * in order to insert checksums into the metadata in large chunks,
408 * we wait until bio submission time. All the pages in the bio are
409 * checksummed and sums are attached onto the ordered extent record.
410 *
411 * At IO completion time the csums attached on the ordered extent record
412 * are inserted into the btree
413 */
374int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 414int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
375 int mirror_num) 415 int mirror_num)
376{ 416{
@@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
383 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 423 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
384} 424}
385 425
426/*
427 * extent_io.c submission hook. This does the right thing for csum calculation on write,
428 * or reading the csums from the tree before a read
429 */
386int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 430int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
387 int mirror_num) 431 int mirror_num)
388{ 432{
@@ -408,6 +452,10 @@ mapit:
408 return btrfs_map_bio(root, rw, bio, mirror_num, 0); 452 return btrfs_map_bio(root, rw, bio, mirror_num, 0);
409} 453}
410 454
455/*
456 * given a list of ordered sums record them in the inode. This happens
457 * at IO completion time based on sums calculated at bio submission time.
458 */
411static noinline int add_pending_csums(struct btrfs_trans_handle *trans, 459static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
412 struct inode *inode, u64 file_offset, 460 struct inode *inode, u64 file_offset,
413 struct list_head *list) 461 struct list_head *list)
@@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
430 GFP_NOFS); 478 GFP_NOFS);
431} 479}
432 480
481/* see btrfs_writepage_start_hook for details on why this is required */
433struct btrfs_writepage_fixup { 482struct btrfs_writepage_fixup {
434 struct page *page; 483 struct page *page;
435 struct btrfs_work work; 484 struct btrfs_work work;
436}; 485};
437 486
438/* see btrfs_writepage_start_hook for details on why this is required */
439void btrfs_writepage_fixup_worker(struct btrfs_work *work) 487void btrfs_writepage_fixup_worker(struct btrfs_work *work)
440{ 488{
441 struct btrfs_writepage_fixup *fixup; 489 struct btrfs_writepage_fixup *fixup;
@@ -522,6 +570,10 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
522 return -EAGAIN; 570 return -EAGAIN;
523} 571}
524 572
573/* as ordered data IO finishes, this gets called so we can finish
574 * an ordered extent if the range of bytes in the file it covers is
575 * fully written.
576 */
525static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 577static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
526{ 578{
527 struct btrfs_root *root = BTRFS_I(inode)->root; 579 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -631,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
631 return btrfs_finish_ordered_io(page->mapping->host, start, end); 683 return btrfs_finish_ordered_io(page->mapping->host, start, end);
632} 684}
633 685
686/*
687 * When IO fails, either with EIO or csum verification fails, we
688 * try other mirrors that might have a good copy of the data. This
689 * io_failure_record is used to record state as we go through all the
690 * mirrors. If another mirror has good data, the page is set up to date
691 * and things continue. If a good mirror can't be found, the original
692 * bio end_io callback is called to indicate things have failed.
693 */
634struct io_failure_record { 694struct io_failure_record {
635 struct page *page; 695 struct page *page;
636 u64 start; 696 u64 start;
@@ -725,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
725 return 0; 785 return 0;
726} 786}
727 787
788/*
789 * each time an IO finishes, we do a fast check in the IO failure tree
790 * to see if we need to process or clean up an io_failure_record
791 */
728int btrfs_clean_io_failures(struct inode *inode, u64 start) 792int btrfs_clean_io_failures(struct inode *inode, u64 start)
729{ 793{
730 u64 private; 794 u64 private;
@@ -753,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
753 return 0; 817 return 0;
754} 818}
755 819
820/*
821 * when reads are done, we need to check csums to verify the data is correct
822 * if there's a match, we allow the bio to finish. If not, we go through
823 * the io_failure_record routines to find good copies
824 */
756int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 825int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
757 struct extent_state *state) 826 struct extent_state *state)
758{ 827{
@@ -990,6 +1059,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
990 btrfs_free_path(path); 1059 btrfs_free_path(path);
991} 1060}
992 1061
1062/*
1063 * read an inode from the btree into the in-memory inode
1064 */
993void btrfs_read_locked_inode(struct inode *inode) 1065void btrfs_read_locked_inode(struct inode *inode)
994{ 1066{
995 struct btrfs_path *path; 1067 struct btrfs_path *path;
@@ -1083,6 +1155,9 @@ make_bad:
1083 make_bad_inode(inode); 1155 make_bad_inode(inode);
1084} 1156}
1085 1157
1158/*
1159 * given a leaf and an inode, copy the inode fields into the leaf
1160 */
1086static void fill_inode_item(struct btrfs_trans_handle *trans, 1161static void fill_inode_item(struct btrfs_trans_handle *trans,
1087 struct extent_buffer *leaf, 1162 struct extent_buffer *leaf,
1088 struct btrfs_inode_item *item, 1163 struct btrfs_inode_item *item,
@@ -1118,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
1118 BTRFS_I(inode)->block_group->key.objectid); 1193 BTRFS_I(inode)->block_group->key.objectid);
1119} 1194}
1120 1195
1196/*
1197 * copy everything in the in-memory inode into the btree.
1198 */
1121int noinline btrfs_update_inode(struct btrfs_trans_handle *trans, 1199int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
1122 struct btrfs_root *root, 1200 struct btrfs_root *root,
1123 struct inode *inode) 1201 struct inode *inode)
@@ -1151,6 +1229,11 @@ failed:
1151} 1229}
1152 1230
1153 1231
1232/*
1233 * unlink helper that gets used here in inode.c and in the tree logging
1234 * recovery code. It remove a link in a directory with a given name, and
1235 * also drops the back refs in the inode to the directory
1236 */
1154int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 1237int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
1155 struct btrfs_root *root, 1238 struct btrfs_root *root,
1156 struct inode *dir, struct inode *inode, 1239 struct inode *dir, struct inode *inode,
@@ -1309,7 +1392,7 @@ fail:
1309/* 1392/*
1310 * this can truncate away extent items, csum items and directory items. 1393 * this can truncate away extent items, csum items and directory items.
1311 * It starts at a high offset and removes keys until it can't find 1394 * It starts at a high offset and removes keys until it can't find
1312 * any higher than i_size. 1395 * any higher than new_size
1313 * 1396 *
1314 * csum items that cross the new i_size are truncated to the new size 1397 * csum items that cross the new i_size are truncated to the new size
1315 * as well. 1398 * as well.
@@ -2123,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode)
2123 btrfs_end_transaction(trans, root); 2206 btrfs_end_transaction(trans, root);
2124} 2207}
2125 2208
2209/*
2210 * find the highest existing sequence number in a directory
2211 * and then set the in-memory index_cnt variable to reflect
2212 * free sequence numbers
2213 */
2126static int btrfs_set_inode_index_count(struct inode *inode) 2214static int btrfs_set_inode_index_count(struct inode *inode)
2127{ 2215{
2128 struct btrfs_root *root = BTRFS_I(inode)->root; 2216 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2175,6 +2263,10 @@ out:
2175 return ret; 2263 return ret;
2176} 2264}
2177 2265
2266/*
2267 * helper to find a free sequence number in a given directory. This current
2268 * code is very simple, later versions will do smarter things in the btree
2269 */
2178static int btrfs_set_inode_index(struct inode *dir, struct inode *inode, 2270static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
2179 u64 *index) 2271 u64 *index)
2180{ 2272{
@@ -2305,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode)
2305 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; 2397 return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
2306} 2398}
2307 2399
2400/*
2401 * utility function to add 'inode' into 'parent_inode' with
2402 * a given name and a given sequence number.
2403 * if 'add_backref' is true, also insert a backref from the
2404 * inode to the parent directory.
2405 */
2308int btrfs_add_link(struct btrfs_trans_handle *trans, 2406int btrfs_add_link(struct btrfs_trans_handle *trans,
2309 struct inode *parent_inode, struct inode *inode, 2407 struct inode *parent_inode, struct inode *inode,
2310 const char *name, int name_len, int add_backref, u64 index) 2408 const char *name, int name_len, int add_backref, u64 index)
@@ -2611,6 +2709,10 @@ out_unlock:
2611 return err; 2709 return err;
2612} 2710}
2613 2711
2712/* helper for btrfs_get_extent. Given an existing extent in the tree,
2713 * and an extent that you want to insert, deal with overlap and insert
2714 * the new extent into the tree.
2715 */
2614static int merge_extent_mapping(struct extent_map_tree *em_tree, 2716static int merge_extent_mapping(struct extent_map_tree *em_tree,
2615 struct extent_map *existing, 2717 struct extent_map *existing,
2616 struct extent_map *em, 2718 struct extent_map *em,
@@ -2627,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
2627 return add_extent_mapping(em_tree, em); 2729 return add_extent_mapping(em_tree, em);
2628} 2730}
2629 2731
2732/*
2733 * a bit scary, this does extent mapping from logical file offset to the disk.
2734 * the ugly parts come from merging extents from the disk with the
2735 * in-ram representation. This gets more complex because of the data=ordered code,
2736 * where the in-ram extents might be locked pending data=ordered completion.
2737 *
2738 * This also copies inline extents directly into the page.
2739 */
2630struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2740struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2631 size_t pg_offset, u64 start, u64 len, 2741 size_t pg_offset, u64 start, u64 len,
2632 int create) 2742 int create)
@@ -2869,76 +2979,11 @@ out:
2869 return em; 2979 return em;
2870} 2980}
2871 2981
2872#if 0 /* waiting for O_DIRECT reads */
2873static int btrfs_get_block(struct inode *inode, sector_t iblock,
2874 struct buffer_head *bh_result, int create)
2875{
2876 struct extent_map *em;
2877 u64 start = (u64)iblock << inode->i_blkbits;
2878 struct btrfs_multi_bio *multi = NULL;
2879 struct btrfs_root *root = BTRFS_I(inode)->root;
2880 u64 len;
2881 u64 logical;
2882 u64 map_length;
2883 int ret = 0;
2884
2885 em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0);
2886
2887 if (!em || IS_ERR(em))
2888 goto out;
2889
2890 if (em->start > start || em->start + em->len <= start) {
2891 goto out;
2892 }
2893
2894 if (em->block_start == EXTENT_MAP_INLINE) {
2895 ret = -EINVAL;
2896 goto out;
2897 }
2898
2899 len = em->start + em->len - start;
2900 len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
2901
2902 if (em->block_start == EXTENT_MAP_HOLE ||
2903 em->block_start == EXTENT_MAP_DELALLOC) {
2904 bh_result->b_size = len;
2905 goto out;
2906 }
2907
2908 logical = start - em->start;
2909 logical = em->block_start + logical;
2910
2911 map_length = len;
2912 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
2913 logical, &map_length, &multi, 0);
2914 BUG_ON(ret);
2915 bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
2916 bh_result->b_size = min(map_length, len);
2917
2918 bh_result->b_bdev = multi->stripes[0].dev->bdev;
2919 set_buffer_mapped(bh_result);
2920 kfree(multi);
2921out:
2922 free_extent_map(em);
2923 return ret;
2924}
2925#endif
2926
2927static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 2982static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
2928 const struct iovec *iov, loff_t offset, 2983 const struct iovec *iov, loff_t offset,
2929 unsigned long nr_segs) 2984 unsigned long nr_segs)
2930{ 2985{
2931 return -EINVAL; 2986 return -EINVAL;
2932#if 0
2933 struct file *file = iocb->ki_filp;
2934 struct inode *inode = file->f_mapping->host;
2935
2936 if (rw == WRITE)
2937 return -EINVAL;
2938
2939 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
2940 offset, nr_segs, btrfs_get_block, NULL);
2941#endif
2942} 2987}
2943 2988
2944static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) 2989static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
@@ -3202,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
3202 } 3247 }
3203} 3248}
3204 3249
3250/*
3251 * create a new subvolume directory/inode (helper for the ioctl).
3252 */
3205int btrfs_create_subvol_root(struct btrfs_root *new_root, 3253int btrfs_create_subvol_root(struct btrfs_root *new_root,
3206 struct btrfs_trans_handle *trans, u64 new_dirid, 3254 struct btrfs_trans_handle *trans, u64 new_dirid,
3207 struct btrfs_block_group_cache *block_group) 3255 struct btrfs_block_group_cache *block_group)
@@ -3223,6 +3271,9 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
3223 return btrfs_update_inode(trans, new_root, inode); 3271 return btrfs_update_inode(trans, new_root, inode);
3224} 3272}
3225 3273
3274/* helper function for file defrag and space balancing. This
3275 * forces readahead on a given range of bytes in an inode
3276 */
3226unsigned long btrfs_force_ra(struct address_space *mapping, 3277unsigned long btrfs_force_ra(struct address_space *mapping,
3227 struct file_ra_state *ra, struct file *file, 3278 struct file_ra_state *ra, struct file *file,
3228 pgoff_t offset, pgoff_t last_index) 3279 pgoff_t offset, pgoff_t last_index)
@@ -3424,6 +3475,10 @@ out_unlock:
3424 return ret; 3475 return ret;
3425} 3476}
3426 3477
3478/*
3479 * some fairly slow code that needs optimization. This walks the list
3480 * of all the inodes with pending delalloc and forces them to disk.
3481 */
3427int btrfs_start_delalloc_inodes(struct btrfs_root *root) 3482int btrfs_start_delalloc_inodes(struct btrfs_root *root)
3428{ 3483{
3429 struct list_head *head = &root->fs_info->delalloc_inodes; 3484 struct list_head *head = &root->fs_info->delalloc_inodes;