author		Tao Ma <tao.ma@oracle.com>	2009-08-24 20:05:12 -0400
committer	Joel Becker <joel.becker@oracle.com>	2009-09-22 23:09:36 -0400
commit		6f70fa519976a379d72781d927cf8e5f5b05ec86 (patch)
tree		563cdeb116f2016c3c4b7a627a51f0a85eec1566
parent		bcbbb24a6a5c5b3e7b8e5284e0bfa23f45c32377 (diff)
ocfs2: Add CoW support.
This patch adds CoW support for a refcounted record.

The whole process is:
1. Calculate how many clusters we need to CoW and where we start.
   Extents that are not completely encompassed by the write will be
   broken on 1MB boundaries.
2. Do the CoW for those clusters with the help of the page cache.
3. Change the b-tree structure to point at the newly allocated clusters.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
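
The 1MB splitting mentioned above comes down to the mask arithmetic in
ocfs2_cow_contig_clusters(), ocfs2_cow_align_start() and
ocfs2_cow_align_length(), which this patch adds to fs/ocfs2/refcounttree.c.
The snippet below is a minimal userspace sketch of that arithmetic, not part
of the patch itself; it assumes a 4KB cluster size, so 1MB of contiguity is
256 clusters, and the names CONTIG_CLUSTERS, cow_align_start() and
cow_align_length() are illustrative stand-ins for the kernel helpers.

#include <stdio.h>
#include <limits.h>

#define CONTIG_CLUSTERS 256u                 /* 1MB / 4KB cluster size (example value) */
#define CONTIG_MASK (~(CONTIG_CLUSTERS - 1)) /* valid because 256 is a power of two */

/* Largest (start + n * CONTIG_CLUSTERS) that is still <= cpos. */
static unsigned int cow_align_start(unsigned int start, unsigned int cpos)
{
	return start + ((cpos - start) & CONTIG_MASK);
}

/* Round len up to a multiple of CONTIG_CLUSTERS, saturating on overflow. */
static unsigned int cow_align_length(unsigned int len)
{
	unsigned int padded = (len + (CONTIG_CLUSTERS - 1)) & CONTIG_MASK;

	return (padded < len) ? UINT_MAX : padded;
}

int main(void)
{
	/* An extent starts at cluster 100 and the write begins at cluster 1000:
	 * the CoW region is aligned down to cluster 868 (100 + 3 * 256). */
	printf("align_start  = %u\n", cow_align_start(100, 1000));

	/* A 300-cluster CoW request is padded out to 512 clusters. */
	printf("align_length = %u\n", cow_align_length(300));
	return 0;
}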
-rw-r--r--	fs/ocfs2/alloc.c	25
-rw-r--r--	fs/ocfs2/alloc.h	5
-rw-r--r--	fs/ocfs2/aops.c	4
-rw-r--r--	fs/ocfs2/aops.h	2
-rw-r--r--	fs/ocfs2/refcounttree.c	814
-rw-r--r--	fs/ocfs2/refcounttree.h	2
6 files changed, 841 insertions(+), 11 deletions(-)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 03438a677933..b8fc95d10630 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6998,9 +6998,9 @@ static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
-static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
-				     unsigned int from, unsigned int to,
-				     struct page *page, int zero, u64 *phys)
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys)
 {
 	int ret, partial = 0;
 
@@ -7068,20 +7068,16 @@ out:
 	ocfs2_unlock_and_free_pages(pages, numpages);
 }
 
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
-				struct page **pages, int *num)
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num)
 {
 	int numpages, ret = 0;
-	struct super_block *sb = inode->i_sb;
 	struct address_space *mapping = inode->i_mapping;
 	unsigned long index;
 	loff_t last_page_bytes;
 
 	BUG_ON(start > end);
 
-	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
-	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
-
 	numpages = 0;
 	last_page_bytes = PAGE_ALIGN(end);
 	index = start >> PAGE_CACHE_SHIFT;
@@ -7109,6 +7105,17 @@ out:
 	return ret;
 }
 
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+				struct page **pages, int *num)
+{
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+	return ocfs2_grab_pages(inode, start, end, pages, num);
+}
+
 /*
  * Zero the area past i_size but still within an allocated
  * cluster. This avoids exposing nonzero data on subsequent file
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 19d5b88a93df..9c122d574464 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -271,6 +271,11 @@ static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 	return !rec->e_leaf_clusters;
 }
 
+int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end,
+		     struct page **pages, int *num);
+void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
+			      unsigned int from, unsigned int to,
+			      struct page *page, int zero, u64 *phys);
 /*
  * Structures which describe a path through a btree, and functions to
  * manipulate them.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 15c594dfd951..fdad075fed61 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -126,8 +126,8 @@ bail:
 	return err;
 }
 
-static int ocfs2_get_block(struct inode *inode, sector_t iblock,
-			   struct buffer_head *bh_result, int create)
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
 {
 	int err = 0;
 	unsigned int ext_flags;
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 503e49232e11..c48e93ffc513 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -57,6 +57,8 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 			   struct buffer_head *di_bh);
 int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size);
 
+int ocfs2_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create);
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index e72dbdd3b6e8..4e7df8b8fd4f 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -31,6 +31,27 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "extent_map.h"
+#include "aops.h"
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include <linux/swap.h>
+
+struct ocfs2_cow_context {
+	struct inode *inode;
+	u32 cow_start;
+	u32 cow_len;
+	struct ocfs2_extent_tree di_et;
+	struct ocfs2_caching_info *ref_ci;
+	struct buffer_head *ref_root_bh;
+	struct ocfs2_alloc_context *meta_ac;
+	struct ocfs2_alloc_context *data_ac;
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+};
 
 static inline struct ocfs2_refcount_tree *
 cache_info_to_refcount(struct ocfs2_caching_info *ci)
@@ -2404,3 +2425,796 @@ out:
 	brelse(ref_root_bh);
 	return ret;
 }
2428
2429#define MAX_CONTIG_BYTES 1048576
2430
2431static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
2432{
2433 return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
2434}
2435
2436static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
2437{
2438 return ~(ocfs2_cow_contig_clusters(sb) - 1);
2439}
2440
2441/*
2442 * Given an extent that starts at 'start' and an I/O that starts at 'cpos',
2443 * find an offset (start + (n * contig_clusters)) that is closest to cpos
2444 * while still being less than or equal to it.
2445 *
2446 * The goal is to break the extent at a multiple of contig_clusters.
2447 */
2448static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
2449 unsigned int start,
2450 unsigned int cpos)
2451{
2452 BUG_ON(start > cpos);
2453
2454 return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
2455}
2456
2457/*
2458 * Given a cluster count of len, pad it out so that it is a multiple
2459 * of contig_clusters.
2460 */
2461static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
2462 unsigned int len)
2463{
2464 unsigned int padded =
2465 (len + (ocfs2_cow_contig_clusters(sb) - 1)) &
2466 ocfs2_cow_contig_mask(sb);
2467
2468 /* Did we wrap? */
2469 if (padded < len)
2470 padded = UINT_MAX;
2471
2472 return padded;
2473}
2474
2475/*
2476 * Calculate the start and the number of virtual clusters we need to CoW.
2477 *
2478 * cpos is the virtual start cluster position we want to CoW in a
2479 * file and write_len is the cluster length.
2480 *
2481 * Normally we start CoW from the beginning of the extent record containing cpos.
2482 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
2483 * get good I/O from the resulting extent tree.
2484 */
2485static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
2486 struct buffer_head *di_bh,
2487 u32 cpos,
2488 u32 write_len,
2489 u32 *cow_start,
2490 u32 *cow_len)
2491{
2492 int ret = 0;
2493 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
2494 struct ocfs2_extent_list *el = &di->id2.i_list;
2495 int tree_height = le16_to_cpu(el->l_tree_depth), i;
2496 struct buffer_head *eb_bh = NULL;
2497 struct ocfs2_extent_block *eb = NULL;
2498 struct ocfs2_extent_rec *rec;
2499 unsigned int want_clusters, rec_end = 0;
2500 int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
2501 int leaf_clusters;
2502
2503 if (tree_height > 0) {
2504 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
2505 if (ret) {
2506 mlog_errno(ret);
2507 goto out;
2508 }
2509
2510 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2511 el = &eb->h_list;
2512
2513 if (el->l_tree_depth) {
2514 ocfs2_error(inode->i_sb,
2515 "Inode %lu has non zero tree depth in "
2516 "leaf block %llu\n", inode->i_ino,
2517 (unsigned long long)eb_bh->b_blocknr);
2518 ret = -EROFS;
2519 goto out;
2520 }
2521 }
2522
2523 *cow_len = 0;
2524 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
2525 rec = &el->l_recs[i];
2526
2527 if (ocfs2_is_empty_extent(rec)) {
2528 mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
2529 "index %d\n", inode->i_ino, i);
2530 continue;
2531 }
2532
2533 if (le32_to_cpu(rec->e_cpos) +
2534 le16_to_cpu(rec->e_leaf_clusters) <= cpos)
2535 continue;
2536
2537 if (*cow_len == 0) {
2538 /*
2539 * We should find a refcounted record in the
2540 * first pass.
2541 */
2542 BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
2543 *cow_start = le32_to_cpu(rec->e_cpos);
2544 }
2545
2546 /*
2547 * If we encounter a hole or a non-refcounted record,
2548 * stop the search.
2549 */
2550 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
2551 (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)))
2552 break;
2553
2554 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
2555 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
2556
2557 /*
2558 * How many clusters do we actually need from
2559 * this extent? First we see how many we actually
2560 * need to complete the write. If that's smaller
2561 * than contig_clusters, we try for contig_clusters.
2562 */
2563 if (!*cow_len)
2564 want_clusters = write_len;
2565 else
2566 want_clusters = (cpos + write_len) -
2567 (*cow_start + *cow_len);
2568 if (want_clusters < contig_clusters)
2569 want_clusters = contig_clusters;
2570
2571 /*
2572 * If the write does not cover the whole extent, we
2573 * need to calculate how we're going to split the extent.
2574 * We try to do it on contig_clusters boundaries.
2575 *
2576 * Any extent smaller than contig_clusters will be
2577 * CoWed in its entirety.
2578 */
2579 if (leaf_clusters <= contig_clusters)
2580 *cow_len += leaf_clusters;
2581 else if (*cow_len || (*cow_start == cpos)) {
2582 /*
2583 * This extent needs to be CoW'd from its
2584 * beginning, so all we have to do is compute
2585 * how many clusters to grab. We align
2586 * want_clusters to the edge of contig_clusters
2587 * to get better I/O.
2588 */
2589 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2590 want_clusters);
2591
2592 if (leaf_clusters < want_clusters)
2593 *cow_len += leaf_clusters;
2594 else
2595 *cow_len += want_clusters;
2596 } else if ((*cow_start + contig_clusters) >=
2597 (cpos + write_len)) {
2598 /*
2599 * Breaking off contig_clusters at the front
2600 * of the extent will cover our write. That's
2601 * easy.
2602 */
2603 *cow_len = contig_clusters;
2604 } else if ((rec_end - cpos) <= contig_clusters) {
2605 /*
2606 * Breaking off contig_clusters at the tail of
2607 * this extent will cover cpos.
2608 */
2609 *cow_start = rec_end - contig_clusters;
2610 *cow_len = contig_clusters;
2611 } else if ((rec_end - cpos) <= want_clusters) {
2612 /*
2613 * While we can't fit the entire write in this
2614 * extent, we know that the write goes from cpos
2615 * to the end of the extent. Break that off.
2616 * We try to break it at some multiple of
2617 * contig_clusters from the front of the extent.
2618 * Failing that (ie, cpos is within
2619 * contig_clusters of the front), we'll CoW the
2620 * entire extent.
2621 */
2622 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2623 *cow_start, cpos);
2624 *cow_len = rec_end - *cow_start;
2625 } else {
2626 /*
2627 * Ok, the entire write lives in the middle of
2628 * this extent. Let's try to slice the extent up
2629 * nicely. Optimally, our CoW region starts at
2630 * m*contig_clusters from the beginning of the
2631 * extent and goes for n*contig_clusters,
2632 * covering the entire write.
2633 */
2634 *cow_start = ocfs2_cow_align_start(inode->i_sb,
2635 *cow_start, cpos);
2636
2637 want_clusters = (cpos + write_len) - *cow_start;
2638 want_clusters = ocfs2_cow_align_length(inode->i_sb,
2639 want_clusters);
2640 if (*cow_start + want_clusters <= rec_end)
2641 *cow_len = want_clusters;
2642 else
2643 *cow_len = rec_end - *cow_start;
2644 }
2645
2646 /* Have we covered our entire write yet? */
2647 if ((*cow_start + *cow_len) >= (cpos + write_len))
2648 break;
2649
2650 /*
2651 * If we reach the end of the extent block and don't get enough
2652 * clusters, continue with the next extent block if possible.
2653 */
2654 if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
2655 eb && eb->h_next_leaf_blk) {
2656 brelse(eb_bh);
2657 eb_bh = NULL;
2658
2659 ret = ocfs2_read_extent_block(INODE_CACHE(inode),
2660 le64_to_cpu(eb->h_next_leaf_blk),
2661 &eb_bh);
2662 if (ret) {
2663 mlog_errno(ret);
2664 goto out;
2665 }
2666
2667 eb = (struct ocfs2_extent_block *) eb_bh->b_data;
2668 el = &eb->h_list;
2669 i = -1;
2670 }
2671 }
2672
2673out:
2674 brelse(eb_bh);
2675 return ret;
2676}
2677
2678/*
2679 * Prepare meta_ac, data_ac and calculate credits when we want to add some
2680 * num_clusters in the data tree "et" and change the refcount for the old
2681 * clusters (starting from p_cluster) in the refcount tree.
2682 *
2683 * Note:
2684 * 1. Since we may split the old tree, we will need at most num_clusters + 2
2685 * more new leaf records.
2686 * 2. In some cases we may not need to reserve new clusters (e.g. reflink), so
2687 * just pass data_ac = NULL.
2688 */
2689static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2690 u32 p_cluster, u32 num_clusters,
2691 struct ocfs2_extent_tree *et,
2692 struct ocfs2_caching_info *ref_ci,
2693 struct buffer_head *ref_root_bh,
2694 struct ocfs2_alloc_context **meta_ac,
2695 struct ocfs2_alloc_context **data_ac,
2696 int *credits)
2697{
2698 int ret = 0, meta_add = 0;
2699 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et);
2700
2701 if (num_free_extents < 0) {
2702 ret = num_free_extents;
2703 mlog_errno(ret);
2704 goto out;
2705 }
2706
2707 if (num_free_extents < num_clusters + 2)
2708 meta_add =
2709 ocfs2_extend_meta_needed(et->et_root_el);
2710
2711 *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
2712 num_clusters + 2);
2713
2714 ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
2715 p_cluster, num_clusters,
2716 &meta_add, credits);
2717 if (ret) {
2718 mlog_errno(ret);
2719 goto out;
2720 }
2721
2722 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
2723 meta_add, num_clusters, *credits);
2724 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2725 meta_ac);
2726 if (ret) {
2727 mlog_errno(ret);
2728 goto out;
2729 }
2730
2731 if (data_ac) {
2732 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
2733 data_ac);
2734 if (ret)
2735 mlog_errno(ret);
2736 }
2737
2738out:
2739 if (ret) {
2740 if (*meta_ac) {
2741 ocfs2_free_alloc_context(*meta_ac);
2742 *meta_ac = NULL;
2743 }
2744 }
2745
2746 return ret;
2747}
2748
2749static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
2750{
2751 BUG_ON(buffer_dirty(bh));
2752
2753 clear_buffer_mapped(bh);
2754
2755 return 0;
2756}
2757
2758static int ocfs2_duplicate_clusters(handle_t *handle,
2759 struct ocfs2_cow_context *context,
2760 u32 cpos, u32 old_cluster,
2761 u32 new_cluster, u32 new_len)
2762{
2763 int ret = 0, partial;
2764 struct ocfs2_caching_info *ci = context->di_et.et_ci;
2765 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2766 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
2767 struct page *page;
2768 pgoff_t page_index;
2769 unsigned int from, to;
2770 loff_t offset, end, map_end;
2771 struct address_space *mapping = context->inode->i_mapping;
2772
2773 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
2774 new_cluster, new_len, cpos);
2775
2776 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2777 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
2778
2779 while (offset < end) {
2780 page_index = offset >> PAGE_CACHE_SHIFT;
2781 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
2782 if (map_end > end)
2783 map_end = end;
2784
2785 /* from and to are offsets within the page. */
2786 from = offset & (PAGE_CACHE_SIZE - 1);
2787 to = PAGE_CACHE_SIZE;
2788 if (map_end & (PAGE_CACHE_SIZE - 1))
2789 to = map_end & (PAGE_CACHE_SIZE - 1);
2790
2791 page = grab_cache_page(mapping, page_index);
2792
2793 /* This page can't be dirtied before we CoW it out. */
2794 BUG_ON(PageDirty(page));
2795
2796 if (!PageUptodate(page)) {
2797 ret = block_read_full_page(page, ocfs2_get_block);
2798 if (ret) {
2799 mlog_errno(ret);
2800 goto unlock;
2801 }
2802 lock_page(page);
2803 }
2804
2805 if (page_has_buffers(page)) {
2806 ret = walk_page_buffers(handle, page_buffers(page),
2807 from, to, &partial,
2808 ocfs2_clear_cow_buffer);
2809 if (ret) {
2810 mlog_errno(ret);
2811 goto unlock;
2812 }
2813 }
2814
2815 ocfs2_map_and_dirty_page(context->inode,
2816 handle, from, to,
2817 page, 0, &new_block);
2818 mark_page_accessed(page);
2819unlock:
2820 unlock_page(page);
2821 page_cache_release(page);
2822 page = NULL;
2823 offset = map_end;
2824 if (ret)
2825 break;
2826 }
2827
2828 return ret;
2829}
2830
2831static int ocfs2_clear_ext_refcount(handle_t *handle,
2832 struct ocfs2_extent_tree *et,
2833 u32 cpos, u32 p_cluster, u32 len,
2834 unsigned int ext_flags,
2835 struct ocfs2_alloc_context *meta_ac,
2836 struct ocfs2_cached_dealloc_ctxt *dealloc)
2837{
2838 int ret, index;
2839 struct ocfs2_extent_rec replace_rec;
2840 struct ocfs2_path *path = NULL;
2841 struct ocfs2_extent_list *el;
2842 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2843 u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
2844
2845 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
2846 (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
2847
2848 memset(&replace_rec, 0, sizeof(replace_rec));
2849 replace_rec.e_cpos = cpu_to_le32(cpos);
2850 replace_rec.e_leaf_clusters = cpu_to_le16(len);
2851 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
2852 p_cluster));
2853 replace_rec.e_flags = ext_flags;
2854 replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
2855
2856 path = ocfs2_new_path_from_et(et);
2857 if (!path) {
2858 ret = -ENOMEM;
2859 mlog_errno(ret);
2860 goto out;
2861 }
2862
2863 ret = ocfs2_find_path(et->et_ci, path, cpos);
2864 if (ret) {
2865 mlog_errno(ret);
2866 goto out;
2867 }
2868
2869 el = path_leaf_el(path);
2870
2871 index = ocfs2_search_extent_list(el, cpos);
2872 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
2873 ocfs2_error(sb,
2874 "Inode %llu has an extent at cpos %u which can no "
2875 "longer be found.\n",
2876 (unsigned long long)ino, cpos);
2877 ret = -EROFS;
2878 goto out;
2879 }
2880
2881 ret = ocfs2_split_extent(handle, et, path, index,
2882 &replace_rec, meta_ac, dealloc);
2883 if (ret)
2884 mlog_errno(ret);
2885
2886out:
2887 ocfs2_free_path(path);
2888 return ret;
2889}
2890
2891static int ocfs2_replace_clusters(handle_t *handle,
2892 struct ocfs2_cow_context *context,
2893 u32 cpos, u32 old,
2894 u32 new, u32 len,
2895 unsigned int ext_flags)
2896{
2897 int ret;
2898 struct ocfs2_caching_info *ci = context->di_et.et_ci;
2899 u64 ino = ocfs2_metadata_cache_owner(ci);
2900
2901 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
2902 (unsigned long long)ino, cpos, old, new, len, ext_flags);
2903
2904 /* If the old clusters are unwritten, there is no need to duplicate them. */
2905 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
2906 ret = ocfs2_duplicate_clusters(handle, context, cpos,
2907 old, new, len);
2908 if (ret) {
2909 mlog_errno(ret);
2910 goto out;
2911 }
2912 }
2913
2914 ret = ocfs2_clear_ext_refcount(handle, &context->di_et,
2915 cpos, new, len, ext_flags,
2916 context->meta_ac, &context->dealloc);
2917 if (ret)
2918 mlog_errno(ret);
2919out:
2920 return ret;
2921}
2922
2923static int ocfs2_cow_sync_writeback(struct super_block *sb,
2924 struct ocfs2_cow_context *context,
2925 u32 cpos, u32 num_clusters)
2926{
2927 int ret = 0;
2928 loff_t offset, end, map_end;
2929 pgoff_t page_index;
2930 struct page *page;
2931
2932 if (ocfs2_should_order_data(context->inode))
2933 return 0;
2934
2935 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
2936 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits);
2937
2938 ret = filemap_fdatawrite_range(context->inode->i_mapping,
2939 offset, end - 1);
2940 if (ret < 0) {
2941 mlog_errno(ret);
2942 return ret;
2943 }
2944
2945 while (offset < end) {
2946 page_index = offset >> PAGE_CACHE_SHIFT;
2947 map_end = (page_index + 1) << PAGE_CACHE_SHIFT;
2948 if (map_end > end)
2949 map_end = end;
2950
2951 page = grab_cache_page(context->inode->i_mapping, page_index);
2952 BUG_ON(!page);
2953
2954 wait_on_page_writeback(page);
2955 if (PageError(page)) {
2956 ret = -EIO;
2957 mlog_errno(ret);
2958 } else
2959 mark_page_accessed(page);
2960
2961 unlock_page(page);
2962 page_cache_release(page);
2963 page = NULL;
2964 offset = map_end;
2965 if (ret)
2966 break;
2967 }
2968
2969 return ret;
2970}
2971
2972static int ocfs2_make_clusters_writable(struct super_block *sb,
2973 struct ocfs2_cow_context *context,
2974 u32 cpos, u32 p_cluster,
2975 u32 num_clusters, unsigned int e_flags)
2976{
2977 int ret, credits = 0;
2978 u32 new_bit, new_len;
2979 struct ocfs2_super *osb = OCFS2_SB(sb);
2980 handle_t *handle;
2981
2982 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
2983 &context->di_et,
2984 context->ref_ci,
2985 context->ref_root_bh,
2986 &context->meta_ac,
2987 &context->data_ac, &credits);
2988 if (ret) {
2989 mlog_errno(ret);
2990 return ret;
2991 }
2992
2993 handle = ocfs2_start_trans(osb, credits);
2994 if (IS_ERR(handle)) {
2995 ret = PTR_ERR(handle);
2996 mlog_errno(ret);
2997 goto out;
2998 }
2999
3000 while (num_clusters) {
3001 ret = __ocfs2_claim_clusters(osb, handle, context->data_ac,
3002 1, num_clusters,
3003 &new_bit, &new_len);
3004 if (ret) {
3005 mlog_errno(ret);
3006 goto out_commit;
3007 }
3008
3009 ret = ocfs2_replace_clusters(handle, context,
3010 cpos, p_cluster, new_bit,
3011 new_len, e_flags);
3012 if (ret) {
3013 mlog_errno(ret);
3014 goto out_commit;
3015 }
3016
3017 cpos += new_len;
3018 p_cluster += new_len;
3019 num_clusters -= new_len;
3020 }
3021
3022 ret = __ocfs2_decrease_refcount(handle, context->ref_ci,
3023 context->ref_root_bh,
3024 p_cluster, num_clusters,
3025 context->meta_ac,
3026 &context->dealloc);
3027 if (ret) {
3028 mlog_errno(ret);
3029 goto out_commit;
3030 }
3031
3032 /*
3033 * Here we should write the new page out first if we are
3034 * in write-back mode.
3035 */
3036 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
3037 if (ret)
3038 mlog_errno(ret);
3039
3040out_commit:
3041 ocfs2_commit_trans(osb, handle);
3042
3043out:
3044 if (context->data_ac) {
3045 ocfs2_free_alloc_context(context->data_ac);
3046 context->data_ac = NULL;
3047 }
3048 if (context->meta_ac) {
3049 ocfs2_free_alloc_context(context->meta_ac);
3050 context->meta_ac = NULL;
3051 }
3052
3053 return ret;
3054}
3055
3056static int ocfs2_replace_cow(struct inode *inode,
3057 struct buffer_head *di_bh,
3058 struct buffer_head *ref_root_bh,
3059 struct ocfs2_caching_info *ref_ci,
3060 u32 cow_start, u32 cow_len)
3061{
3062 int ret = 0;
3063 u32 p_cluster, num_clusters, start = cow_start;
3064 unsigned int ext_flags;
3065 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3066 struct ocfs2_cow_context *context;
3067
3068 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
3069 ocfs2_error(inode->i_sb, "Inode %lu wants to use refcount "
3070 "tree, but the feature bit is not set in the "
3071 "super block.", inode->i_ino);
3072 return -EROFS;
3073 }
3074
3075 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
3076 if (!context) {
3077 ret = -ENOMEM;
3078 mlog_errno(ret);
3079 return ret;
3080 }
3081
3082 context->inode = inode;
3083 context->cow_start = cow_start;
3084 context->cow_len = cow_len;
3085 context->ref_ci = ref_ci;
3086 context->ref_root_bh = ref_root_bh;
3087
3088 ocfs2_init_dealloc_ctxt(&context->dealloc);
3089 ocfs2_init_dinode_extent_tree(&context->di_et,
3090 INODE_CACHE(inode), di_bh);
3091
3092 while (cow_len) {
3093 ret = ocfs2_get_clusters(inode, cow_start, &p_cluster,
3094 &num_clusters, &ext_flags);
3095 if (ret) {
3096 mlog_errno(ret);
3097 break;
3098 }
3099
3100 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
3101
3102 if (cow_len < num_clusters)
3103 num_clusters = cow_len;
3104
3105 ret = ocfs2_make_clusters_writable(inode->i_sb, context,
3106 cow_start, p_cluster,
3107 num_clusters, ext_flags);
3108 if (ret) {
3109 mlog_errno(ret);
3110 break;
3111 }
3112
3113 cow_len -= num_clusters;
3114 cow_start += num_clusters;
3115 }
3116
3117
3118 /*
3119 * Truncate the extent map here. No matter whether we hit an error
3120 * during the operation, we should not trust the cached extent map
3121 * any more.
3122 */
3123 ocfs2_extent_map_trunc(inode, start);
3124
3125 if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
3126 ocfs2_schedule_truncate_log_flush(osb, 1);
3127 ocfs2_run_deallocs(osb, &context->dealloc);
3128 }
3129
3130 kfree(context);
3131 return ret;
3132}
3133
3134/*
3135 * Starting at cpos, try to CoW write_len clusters.
3136 * This will stop when it runs into a hole or an unrefcounted extent.
3137 */
3138static int ocfs2_refcount_cow_hunk(struct inode *inode,
3139 struct buffer_head *di_bh,
3140 u32 cpos, u32 write_len)
3141{
3142 int ret;
3143 u32 cow_start = 0, cow_len = 0;
3144 struct ocfs2_inode_info *oi = OCFS2_I(inode);
3145 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3146 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3147 struct buffer_head *ref_root_bh = NULL;
3148 struct ocfs2_refcount_tree *ref_tree;
3149
3150 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
3151
3152 ret = ocfs2_refcount_cal_cow_clusters(inode, di_bh, cpos, write_len,
3153 &cow_start, &cow_len);
3154 if (ret) {
3155 mlog_errno(ret);
3156 goto out;
3157 }
3158 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
3159 "cow_len %u\n", inode->i_ino,
3160 cpos, write_len, cow_start, cow_len);
3161
3162 BUG_ON(cow_len == 0);
3163
3164 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
3165 1, &ref_tree, &ref_root_bh);
3166 if (ret) {
3167 mlog_errno(ret);
3168 goto out;
3169 }
3170
3171 ret = ocfs2_replace_cow(inode, di_bh, ref_root_bh, &ref_tree->rf_ci,
3172 cow_start, cow_len);
3173 if (ret)
3174 mlog_errno(ret);
3175
3176 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
3177 brelse(ref_root_bh);
3178out:
3179 return ret;
3180}
3181
3182/*
3183 * CoW any and all clusters between cpos and cpos+write_len.
3184 * If this returns successfully, all clusters between cpos and
3185 * cpos+write_len are safe to modify.
3186 */
3187int ocfs2_refcount_cow(struct inode *inode,
3188 struct buffer_head *di_bh,
3189 u32 cpos, u32 write_len)
3190{
3191 int ret = 0;
3192 u32 p_cluster, num_clusters;
3193 unsigned int ext_flags;
3194
3195 while (write_len) {
3196 ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
3197 &num_clusters, &ext_flags);
3198 if (ret) {
3199 mlog_errno(ret);
3200 break;
3201 }
3202
3203 if (write_len < num_clusters)
3204 num_clusters = write_len;
3205
3206 if (ext_flags & OCFS2_EXT_REFCOUNTED) {
3207 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
3208 num_clusters);
3209 if (ret) {
3210 mlog_errno(ret);
3211 break;
3212 }
3213 }
3214
3215 write_len -= num_clusters;
3216 cpos += num_clusters;
3217 }
3218
3219 return ret;
3220}
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index b8c9ed7dc383..9960878134df 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -51,4 +51,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 					  u32 clusters,
 					  int *credits,
 					  struct ocfs2_alloc_context **meta_ac);
+int ocfs2_refcount_cow(struct inode *inode, struct buffer_head *di_bh,
+		       u32 cpos, u32 write_len);
 #endif /* OCFS2_REFCOUNTTREE_H */