author	Linus Torvalds <torvalds@linux-foundation.org>	2015-11-06 20:17:13 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-11-06 20:17:13 -0500
commit	27eb427bdc0960ad64b72da03e3596c801e7a9e9 (patch)
tree	4170a265e99d455ca53d26a19e59330e3277fccd /fs/btrfs/file.c
parent	713009809681e5a7871e96e6992692c805b4480b (diff)
parent	2959a32a858a2c44bbbce83d19c158d54cc5998a (diff)
Merge branch 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "We have a lot of subvolume quota improvements in here, along with big
  piles of cleanups from Dave Sterba and Anand Jain and others.

  Josef pitched in a batch of allocator fixes based on production use
  here at FB. We found that mount -o ssd_spread greatly improved our
  performance on hardware raid5/6, but it exposed some CPU bottlenecks
  in the allocator. These patches make a huge difference"

* 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (100 commits)
  Btrfs: fix hole punching when using the no-holes feature
  Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state
  btrfs: Fix a data space underflow warning
  btrfs: qgroup: Fix a rebase bug which will cause qgroup double free
  btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans
  btrfs: clear PF_NOFREEZE in cleaner_kthread()
  btrfs: qgroup: Don't copy extent buffer to do qgroup rescan
  btrfs: add balance filters limits, stripes and usage to supported mask
  btrfs: extend balance filter usage to take minimum and maximum
  btrfs: add balance filter for stripes
  btrfs: extend balance filter limit to take minimum and maximum
  btrfs: fix use after free iterating extrefs
  btrfs: check unsupported filters in balance arguments
  Btrfs: fix regression running delayed references when using qgroups
  Btrfs: fix regression when running delayed references
  Btrfs: don't do extra bitmap search in one bit case
  Btrfs: keep track of largest extent in bitmaps
  Btrfs: don't keep trying to build clusters if we are fragmented
  Btrfs: cut down on loops through the allocator
  Btrfs: don't continue setting up space cache when enospc
  ...
Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--	fs/btrfs/file.c	228
1 file changed, 162 insertions(+), 66 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8c6f247ba81d..6bd5ce9d75f0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -847,7 +847,7 @@ next_slot:
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						new_key.objectid,
-						start - extent_offset, 1);
+						start - extent_offset);
 				BUG_ON(ret); /* -ENOMEM */
 			}
 			key.offset = start;
@@ -925,7 +925,7 @@ delete_extent_item:
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						key.objectid, key.offset -
-						extent_offset, 0);
+						extent_offset);
 				BUG_ON(ret); /* -ENOMEM */
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
@@ -1204,7 +1204,7 @@ again:
 
 		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 					   root->root_key.objectid,
-					   ino, orig_offset, 1);
+					   ino, orig_offset);
 		BUG_ON(ret); /* -ENOMEM */
 
 		if (split == start) {
@@ -1231,7 +1231,7 @@ again:
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 						0, root->root_key.objectid,
-						ino, orig_offset, 0);
+						ino, orig_offset);
 			BUG_ON(ret); /* -ENOMEM */
 		}
 		other_start = 0;
@@ -1248,7 +1248,7 @@ again:
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 						0, root->root_key.objectid,
-						ino, orig_offset, 0);
+						ino, orig_offset);
 			BUG_ON(ret); /* -ENOMEM */
 		}
 		if (del_nr == 0) {
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	u64 release_bytes = 0;
 	u64 lockstart;
 	u64 lockend;
-	unsigned long first_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	if (!pages)
 		return -ENOMEM;
 
-	first_index = pos >> PAGE_CACHE_SHIFT;
-
 	while (iov_iter_count(i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		}
 
 		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-		ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
-		if (ret == -ENOSPC &&
-		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
-					      BTRFS_INODE_PREALLOC))) {
+
+		if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					     BTRFS_INODE_PREALLOC)) {
 			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret < 0)
+				break;
 			if (ret > 0) {
+				/*
+				 * For nodata cow case, no need to reserve
+				 * data space.
+				 */
 				only_release_metadata = true;
 				/*
 				 * our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 				num_pages = DIV_ROUND_UP(write_bytes + offset,
 							 PAGE_CACHE_SIZE);
 				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
-				ret = 0;
-			} else {
-				ret = -ENOSPC;
+				goto reserve_metadata;
 			}
 		}
-
-		if (ret)
+		ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+		if (ret < 0)
 			break;
 
+reserve_metadata:
 		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
 		if (ret) {
 			if (!only_release_metadata)
-				btrfs_free_reserved_data_space(inode,
-							       reserve_bytes);
+				btrfs_free_reserved_data_space(inode, pos,
+							       write_bytes);
 			else
 				btrfs_end_write_no_snapshoting(root);
 			break;
@@ -1603,12 +1604,17 @@ again:
 				BTRFS_I(inode)->outstanding_extents++;
 				spin_unlock(&BTRFS_I(inode)->lock);
 			}
-			if (only_release_metadata)
+			if (only_release_metadata) {
 				btrfs_delalloc_release_metadata(inode,
 								release_bytes);
-			else
-				btrfs_delalloc_release_space(inode,
+			} else {
+				u64 __pos;
+
+				__pos = round_down(pos, root->sectorsize) +
+					(dirty_pages << PAGE_CACHE_SHIFT);
+				btrfs_delalloc_release_space(inode, __pos,
 							     release_bytes);
+			}
 		}
 
 		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
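The new release path frees only the tail of the data reservation that was never dirtied: __pos is the sector-aligned start of the write plus the span of pages actually dirtied, so the released range [__pos, __pos + release_bytes) covers exactly the reserved-but-untouched pages (in the surrounding code, not shown in this hunk, release_bytes at this point is the reserved-minus-dirtied tail). A standalone userspace sketch of that arithmetic with made-up values; PAGE_SHIFT, the sector size, and the round_down helper are redefined locally, not taken from kernel headers:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12		/* 4 KiB pages, as on x86 */
#define SECTORSIZE	4096ULL		/* hypothetical root->sectorsize */

/* Kernel-style round_down for a power-of-two alignment. */
static uint64_t round_down_pow2(uint64_t x, uint64_t align)
{
	return x & ~(align - 1);
}

int main(void)
{
	uint64_t pos = 8292;		/* write starts 100 bytes into page 2 */
	uint64_t num_pages = 3;		/* pages reserved up front */
	uint64_t dirty_pages = 1;	/* pages actually dirtied */

	/* Tail of the reservation that was never dirtied. */
	uint64_t release_bytes = (num_pages - dirty_pages) << PAGE_SHIFT;
	/* First byte past the dirtied span, like __pos in the hunk above. */
	uint64_t rel_pos = round_down_pow2(pos, SECTORSIZE) +
			   (dirty_pages << PAGE_SHIFT);

	printf("release [%llu, %llu)\n", (unsigned long long)rel_pos,
	       (unsigned long long)(rel_pos + release_bytes));
	return 0;
}

This prints release [12288, 20480): of the reservation [8192, 20480), only the one dirtied page [8192, 12288) is kept.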
@@ -1660,7 +1666,7 @@ again:
 			btrfs_end_write_no_snapshoting(root);
 			btrfs_delalloc_release_metadata(inode, release_bytes);
 		} else {
-			btrfs_delalloc_release_space(inode, release_bytes);
+			btrfs_delalloc_release_space(inode, pos, release_bytes);
 		}
 	}
 
@@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	u64 drop_end;
 	int ret = 0;
 	int err = 0;
-	int rsv_count;
+	unsigned int rsv_count;
 	bool same_page;
 	bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
 	u64 ino_size;
@@ -2488,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	/*
+	 * If we are using the NO_HOLES feature we might have had already an
+	 * hole that overlaps a part of the region [lockstart, lockend] and
+	 * ends at (or beyond) lockend. Since we have no file extent items to
+	 * represent holes, drop_end can be less than lockend and so we must
+	 * make sure we have an extent map representing the existing hole (the
+	 * call to __btrfs_drop_extents() might have dropped the existing extent
+	 * map representing the existing hole), otherwise the fast fsync path
+	 * will not record the existence of the hole region
+	 * [existing_hole_start, lockend].
+	 */
+	if (drop_end <= lockend)
+		drop_end = lockend + 1;
+	/*
 	 * Don't insert file hole extent item if it's for a range beyond eof
 	 * (because it's useless) or if it represents a 0 bytes range (when
 	 * cur_offset == drop_end).
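The clamp added above is the core of the NO_HOLES hole-punching fix: when __btrfs_drop_extents() stops short of lockend (a trailing hole has no file extent item to drop), bumping drop_end past lockend lets the subsequent hole-filling loop cover the rest of the punched range. A trivial standalone sketch of the interval logic, with hypothetical offsets:

#include <stdio.h>
#include <stdint.h>

/*
 * With NO_HOLES, the drop can stop before lockend since a trailing
 * hole has no file extent item; extending drop_end past lockend makes
 * the caller's hole-filling loop reach the end of the punched range.
 */
static uint64_t clamp_drop_end(uint64_t drop_end, uint64_t lockend)
{
	return drop_end <= lockend ? lockend + 1 : drop_end;
}

int main(void)
{
	/* Drop stopped early at 8192; the punched range is locked to 16383. */
	printf("%llu\n", (unsigned long long)clamp_drop_end(8192, 16383));
	/* Prints 16384: the fill loop now covers [8192, 16383] too. */
	printf("%llu\n", (unsigned long long)clamp_drop_end(20480, 16383));
	/* Prints 20480: unchanged when the drop already covered it all. */
	return 0;
}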
@@ -2541,17 +2560,61 @@ out_only_mutex:
 	return err;
 }
 
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+	struct list_head list;
+	u64 start;
+	u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+	struct falloc_range *prev = NULL;
+	struct falloc_range *range = NULL;
+
+	if (list_empty(head))
+		goto insert;
+
+	/*
+	 * As fallocate iterate by bytenr order, we only need to check
+	 * the last range.
+	 */
+	prev = list_entry(head->prev, struct falloc_range, list);
+	if (prev->start + prev->len == start) {
+		prev->len += len;
+		return 0;
+	}
+insert:
+	range = kmalloc(sizeof(*range), GFP_NOFS);
+	if (!range)
+		return -ENOMEM;
+	range->start = start;
+	range->len = len;
+	list_add_tail(&range->list, head);
+	return 0;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
 	struct extent_state *cached_state = NULL;
+	struct falloc_range *range;
+	struct falloc_range *tmp;
+	struct list_head reserve_list;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
 	u64 alloc_end;
 	u64 alloc_hint = 0;
 	u64 locked_end;
+	u64 actual_end = 0;
 	struct extent_map *em;
 	int blocksize = BTRFS_I(inode)->root->sectorsize;
 	int ret;
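add_falloc_range() only ever tries to merge with the tail of the list, which is safe because btrfs_fallocate() walks the file in ascending offset order, so a new range can only be contiguous with the most recently added one. A minimal userspace sketch of the same coalescing idea, using a hand-rolled singly linked list in place of the kernel's list_head:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct range {
	uint64_t start;
	uint64_t len;
	struct range *next;
};

/*
 * Append [start, start + len) at the tail, merging with the previous
 * range when the two are contiguous. Only the tail can ever merge
 * because callers add ranges in ascending offset order.
 */
static int add_range(struct range **head, struct range **tail,
		     uint64_t start, uint64_t len)
{
	struct range *r;

	if (*tail && (*tail)->start + (*tail)->len == start) {
		(*tail)->len += len;
		return 0;
	}
	r = malloc(sizeof(*r));
	if (!r)
		return -1;
	r->start = start;
	r->len = len;
	r->next = NULL;
	if (*tail)
		(*tail)->next = r;
	else
		*head = r;
	*tail = r;
	return 0;
}

int main(void)
{
	struct range *head = NULL, *tail = NULL, *r;

	/* Two contiguous holes merge; the third, disjoint one does not. */
	add_range(&head, &tail, 0, 4096);
	add_range(&head, &tail, 4096, 8192);
	add_range(&head, &tail, 65536, 4096);

	while ((r = head)) {
		printf("[%llu, +%llu)\n", (unsigned long long)r->start,
		       (unsigned long long)r->len);
		head = r->next;
		free(r);
	}
	return 0;
}

This prints [0, +12288) and [65536, +4096): the first two holes collapse into one range, so the later allocation pass issues one preallocation instead of two.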
@@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode,
 		return btrfs_punch_hole(inode, offset, len);
 
 	/*
-	 * Make sure we have enough space before we do the
-	 * allocation.
+	 * Only trigger disk allocation, don't trigger qgroup reserve
+	 *
+	 * For qgroup space, it will be checked later.
 	 */
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
-	if (ret)
+	ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+	if (ret < 0)
 		return ret;
 
 	mutex_lock(&inode->i_mutex);
@@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret)
 		goto out;
 
+	/*
+	 * TODO: Move these two operations after we have checked
+	 * accurate reserved space, or fallocate can still fail but
+	 * with page truncated or size expanded.
+	 *
+	 * But that's a minor problem and won't do much harm BTW.
+	 */
 	if (alloc_start > inode->i_size) {
 		ret = btrfs_cont_expand(inode, i_size_read(inode),
 					alloc_start);
@@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode,
 		}
 	}
 
+	/* First, check if we exceed the qgroup limit */
+	INIT_LIST_HEAD(&reserve_list);
 	cur_offset = alloc_start;
 	while (1) {
-		u64 actual_end;
-
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 				      alloc_end - cur_offset, 0);
 		if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode,
 		last_byte = min(extent_map_end(em), alloc_end);
 		actual_end = min_t(u64, extent_map_end(em), offset + len);
 		last_byte = ALIGN(last_byte, blocksize);
-
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
-							last_byte - cur_offset,
-							1 << inode->i_blkbits,
-							offset + len,
-							&alloc_hint);
-		} else if (actual_end > inode->i_size &&
-			   !(mode & FALLOC_FL_KEEP_SIZE)) {
-			struct btrfs_trans_handle *trans;
-			struct btrfs_root *root = BTRFS_I(inode)->root;
-
-			/*
-			 * We didn't need to allocate any more space, but we
-			 * still extended the size of the file so we need to
-			 * update i_size and the inode item.
-			 */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				ret = PTR_ERR(trans);
-			} else {
-				inode->i_ctime = CURRENT_TIME;
-				i_size_write(inode, actual_end);
-				btrfs_ordered_update_i_size(inode, actual_end,
-							    NULL);
-				ret = btrfs_update_inode(trans, root, inode);
-				if (ret)
-					btrfs_end_transaction(trans, root);
-				else
-					ret = btrfs_end_transaction(trans,
-								    root);
-			}
+			ret = add_falloc_range(&reserve_list, cur_offset,
+					       last_byte - cur_offset);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
+			}
+			ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+					last_byte - cur_offset);
+			if (ret < 0)
+				break;
 		}
 		free_extent_map(em);
-		if (ret < 0)
-			break;
-
 		cur_offset = last_byte;
-		if (cur_offset >= alloc_end) {
-			ret = 0;
+		if (cur_offset >= alloc_end)
 			break;
-		}
 	}
+
+	/*
+	 * If ret is still 0, means we're OK to fallocate.
+	 * Or just cleanup the list and exit.
+	 */
+	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+		if (!ret)
+			ret = btrfs_prealloc_file_range(inode, mode,
+					range->start,
+					range->len, 1 << inode->i_blkbits,
+					offset + len, &alloc_hint);
+		list_del(&range->list);
+		kfree(range);
+	}
+	if (ret < 0)
+		goto out_unlock;
+
+	if (actual_end > inode->i_size &&
+	    !(mode & FALLOC_FL_KEEP_SIZE)) {
+		struct btrfs_trans_handle *trans;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+
+		/*
+		 * We didn't need to allocate any more space, but we
+		 * still extended the size of the file so we need to
+		 * update i_size and the inode item.
+		 */
+		trans = btrfs_start_transaction(root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+		} else {
+			inode->i_ctime = CURRENT_TIME;
+			i_size_write(inode, actual_end);
+			btrfs_ordered_update_i_size(inode, actual_end, NULL);
+			ret = btrfs_update_inode(trans, root, inode);
+			if (ret)
+				btrfs_end_transaction(trans, root);
+			else
+				ret = btrfs_end_transaction(trans, root);
		}
 	}
+out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
 out:
+	/*
+	 * As we waited the extent range, the data_rsv_map must be empty
+	 * in the range, as written data range will be released from it.
+	 * And for prealloacted extent, it will also be released when
+	 * its metadata is written.
+	 * So this is completely used as cleanup.
+	 */
+	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
 	mutex_unlock(&inode->i_mutex);
 	/* Let go of our reservation. */
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+	btrfs_free_reserved_data_space(inode, alloc_start,
+				       alloc_end - alloc_start);
 	return ret;
 }
 
2709 2805