aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/extents.c
diff options
context:
space:
mode:
authorEric Gouriou <egouriou@google.com>2011-10-27 11:43:23 -0400
committerTheodore Ts'o <tytso@mit.edu>2011-10-27 11:43:23 -0400
commit6f91bc5fda82d2c49b4f7fb29424cf6a3c7574bc (patch)
treee5670070f007c4ad5300e93a4e36fa9a802e2bd9 /fs/ext4/extents.c
parent446066724c3629664e29942a00b0aee0d6b1663a (diff)
ext4: optimize ext4_ext_convert_to_initialized()
This patch introduces a fast path in ext4_ext_convert_to_initialized() for the case when the conversion can be performed by transferring the newly initialized blocks from the uninitialized extent into an adjacent initialized extent. Doing so removes the expensive invocations of memmove() which occur during extent insertion and the subsequent merge. In practice this should be the common case for clients performing append writes into files pre-allocated via fallocate(FALLOC_FL_KEEP_SIZE). In such a workload performed via direct IO and when using a suboptimal implementation of memmove() (x86_64 prior to the 2.6.39 rewrite), this patch reduces kernel CPU consumption by 32%. Two new trace points are added to ext4_ext_convert_to_initialized() to offer visibility into its operations. No exit trace point has been added due to the multiplicity of return points. This can be revisited once the upstream cleanup is backported. Signed-off-by: Eric Gouriou <egouriou@google.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/extents.c')
-rw-r--r--fs/ext4/extents.c93
1 files changed, 93 insertions, 0 deletions
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c2ac06cb2d4..8b6a17b6097 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2919,12 +2919,23 @@ out:
2919 * a> There is no split required: Entire extent should be initialized 2919 * a> There is no split required: Entire extent should be initialized
2920 * b> Splits in two extents: Write is happening at either end of the extent 2920 * b> Splits in two extents: Write is happening at either end of the extent
2921 * c> Splits in three extents: Someone is writing in middle of the extent 2921 * c> Splits in three extents: Someone is writing in middle of the extent
2922 *
2923 * Pre-conditions:
2924 * - The extent pointed to by 'path' is uninitialized.
2925 * - The extent pointed to by 'path' contains a superset
2926 * of the logical span [map->m_lblk, map->m_lblk + map->m_len).
2927 *
2928 * Post-conditions on success:
2929 * - the returned value is the number of blocks beyond map->m_lblk
2930 * that are allocated and initialized.
2931 * It is guaranteed to be >= map->m_len.
2922 */ 2932 */
2923static int ext4_ext_convert_to_initialized(handle_t *handle, 2933static int ext4_ext_convert_to_initialized(handle_t *handle,
2924 struct inode *inode, 2934 struct inode *inode,
2925 struct ext4_map_blocks *map, 2935 struct ext4_map_blocks *map,
2926 struct ext4_ext_path *path) 2936 struct ext4_ext_path *path)
2927{ 2937{
2938 struct ext4_extent_header *eh;
2928 struct ext4_map_blocks split_map; 2939 struct ext4_map_blocks split_map;
2929 struct ext4_extent zero_ex; 2940 struct ext4_extent zero_ex;
2930 struct ext4_extent *ex; 2941 struct ext4_extent *ex;
@@ -2944,11 +2955,93 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2944 eof_block = map->m_lblk + map->m_len; 2955 eof_block = map->m_lblk + map->m_len;
2945 2956
2946 depth = ext_depth(inode); 2957 depth = ext_depth(inode);
2958 eh = path[depth].p_hdr;
2947 ex = path[depth].p_ext; 2959 ex = path[depth].p_ext;
2948 ee_block = le32_to_cpu(ex->ee_block); 2960 ee_block = le32_to_cpu(ex->ee_block);
2949 ee_len = ext4_ext_get_actual_len(ex); 2961 ee_len = ext4_ext_get_actual_len(ex);
2950 allocated = ee_len - (map->m_lblk - ee_block); 2962 allocated = ee_len - (map->m_lblk - ee_block);
2951 2963
2964 trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
2965
2966 /* Pre-conditions */
2967 BUG_ON(!ext4_ext_is_uninitialized(ex));
2968 BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
2969 BUG_ON(map->m_lblk + map->m_len > ee_block + ee_len);
2970
2971 /*
2972 * Attempt to transfer newly initialized blocks from the currently
2973 * uninitialized extent to its left neighbor. This is much cheaper
2974 * than an insertion followed by a merge as those involve costly
2975 * memmove() calls. This is the common case in steady state for
2976 * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
2977 * writes.
2978 *
2979 * Limitations of the current logic:
2980 * - L1: we only deal with writes at the start of the extent.
2981 * The approach could be extended to writes at the end
2982 * of the extent but this scenario was deemed less common.
2983 * - L2: we do not deal with writes covering the whole extent.
2984 * This would require removing the extent if the transfer
2985 * is possible.
2986 * - L3: we only attempt to merge with an extent stored in the
2987 * same extent tree node.
2988 */
2989 if ((map->m_lblk == ee_block) && /*L1*/
2990 (map->m_len < ee_len) && /*L2*/
2991 (ex > EXT_FIRST_EXTENT(eh))) { /*L3*/
2992 struct ext4_extent *prev_ex;
2993 ext4_lblk_t prev_lblk;
2994 ext4_fsblk_t prev_pblk, ee_pblk;
2995 unsigned int prev_len, write_len;
2996
2997 prev_ex = ex - 1;
2998 prev_lblk = le32_to_cpu(prev_ex->ee_block);
2999 prev_len = ext4_ext_get_actual_len(prev_ex);
3000 prev_pblk = ext4_ext_pblock(prev_ex);
3001 ee_pblk = ext4_ext_pblock(ex);
3002 write_len = map->m_len;
3003
3004 /*
3005 * A transfer of blocks from 'ex' to 'prev_ex' is allowed
3006 * upon those conditions:
3007 * - C1: prev_ex is initialized,
3008 * - C2: prev_ex is logically abutting ex,
3009 * - C3: prev_ex is physically abutting ex,
3010 * - C4: prev_ex can receive the additional blocks without
3011 * overflowing the (initialized) length limit.
3012 */
3013 if ((!ext4_ext_is_uninitialized(prev_ex)) && /*C1*/
3014 ((prev_lblk + prev_len) == ee_block) && /*C2*/
3015 ((prev_pblk + prev_len) == ee_pblk) && /*C3*/
3016 (prev_len < (EXT_INIT_MAX_LEN - write_len))) { /*C4*/
3017 err = ext4_ext_get_access(handle, inode, path + depth);
3018 if (err)
3019 goto out;
3020
3021 trace_ext4_ext_convert_to_initialized_fastpath(inode,
3022 map, ex, prev_ex);
3023
3024 /* Shift the start of ex by 'write_len' blocks */
3025 ex->ee_block = cpu_to_le32(ee_block + write_len);
3026 ext4_ext_store_pblock(ex, ee_pblk + write_len);
3027 ex->ee_len = cpu_to_le16(ee_len - write_len);
3028 ext4_ext_mark_uninitialized(ex); /* Restore the flag */
3029
3030 /* Extend prev_ex by 'write_len' blocks */
3031 prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
3032
3033 /* Mark the block containing both extents as dirty */
3034 ext4_ext_dirty(handle, inode, path + depth);
3035
3036 /* Update path to point to the right extent */
3037 path[depth].p_ext = prev_ex;
3038
3039 /* Result: number of initialized blocks past m_lblk */
3040 allocated = write_len;
3041 goto out;
3042 }
3043 }
3044
2952 WARN_ON(map->m_lblk < ee_block); 3045 WARN_ON(map->m_lblk < ee_block);
2953 /* 3046 /*
2954 * It is safe to convert extent to initialized via explicit 3047 * It is safe to convert extent to initialized via explicit