author	Aditya Kali <adityakali@google.com>	2011-09-09 19:04:51 -0400
committer	Theodore Ts'o <tytso@mit.edu>	2011-09-09 19:04:51 -0400
commit	7b415bf60f6afb0499fd3dc0ee33444f54e28567 (patch)
tree	9c64fef2b8d60ce64865af6e4c2cc6008026e28c /fs/ext4/extents.c
parent	27baebb849d46d901e756e6502b0a65a62e43771 (diff)
ext4: Fix bigalloc quota accounting and i_blocks value
With the bigalloc changes, the i_blocks value was not set correctly (it was
still set to the number of blocks being used, but in the bigalloc case we want
i_blocks to represent the number of clusters being used). Since the quota
subsystem sets the i_blocks value, this patch fixes the quota accounting and
makes sure that the i_blocks value is set correctly.
Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/extents.c')
-rw-r--r--	fs/ext4/extents.c	306
1 file changed, 302 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cd4479c08031..c4e005864534 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2686,6 +2686,21 @@ again:
 		}
 	}
 
+	/* If we still have something in the partial cluster and we have removed
+	 * even the first extent, then we should free the blocks in the partial
+	 * cluster as well. */
+	if (partial_cluster && path->p_hdr->eh_entries == 0) {
+		int flags = EXT4_FREE_BLOCKS_FORGET;
+
+		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+			flags |= EXT4_FREE_BLOCKS_METADATA;
+
+		ext4_free_blocks(handle, inode, NULL,
+				 EXT4_C2B(EXT4_SB(sb), partial_cluster),
+				 EXT4_SB(sb)->s_cluster_ratio, flags);
+		partial_cluster = 0;
+	}
+
 	/* TODO: flexible tree reduction should be here */
 	if (path->p_hdr->eh_entries == 0) {
 		/*
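The hunk above frees a whole partial cluster: EXT4_C2B() turns the cluster number into its first block, and s_cluster_ratio blocks starting there are handed to ext4_free_blocks() in one call. A minimal sketch of that arithmetic, assuming a power-of-two cluster ratio as bigalloc uses (all values invented):

#include <stdio.h>

int main(void)
{
	unsigned long long partial_cluster = 5;	/* invented value */
	unsigned int ratio = 4;		/* EXT4_SB(sb)->s_cluster_ratio */

	/* EXT4_C2B(sbi, partial_cluster): first block of the cluster */
	unsigned long long first = partial_cluster * ratio;

	printf("freeing blocks %llu..%llu\n", first, first + ratio - 1);
	return 0;	/* prints: freeing blocks 20..23 */
}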
@@ -3233,6 +3248,195 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
 	return ext4_mark_inode_dirty(handle, inode);
 }
 
+/**
+ * ext4_find_delalloc_range: find delayed allocated block in the given range.
+ *
+ * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
+ * whether there are any buffers marked for delayed allocation. It returns '1'
+ * on the first delalloc'ed buffer head found. If no buffer head in the given
+ * range is marked for delalloc, it returns 0.
+ * lblk_start should always be <= lblk_end.
+ * search_hint_reverse indicates that searching in reverse from lblk_end to
+ * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
+ * block sooner). This is useful when blocks are truncated sequentially from
+ * lblk_start towards lblk_end.
+ */
+static int ext4_find_delalloc_range(struct inode *inode,
+				    ext4_lblk_t lblk_start,
+				    ext4_lblk_t lblk_end,
+				    int search_hint_reverse)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct buffer_head *head, *bh = NULL;
+	struct page *page;
+	ext4_lblk_t i, pg_lblk;
+	pgoff_t index;
+
+	/* reverse search won't work if fs block size is less than page size */
+	if (inode->i_blkbits < PAGE_CACHE_SHIFT)
+		search_hint_reverse = 0;
+
+	if (search_hint_reverse)
+		i = lblk_end;
+	else
+		i = lblk_start;
+
+	index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	while ((i >= lblk_start) && (i <= lblk_end)) {
+		page = find_get_page(mapping, index);
+		if (!page || !PageDirty(page))
+			goto nextpage;
+
+		if (PageWriteback(page)) {
+			/*
+			 * This might be a race with allocation and writeout. In
+			 * this case we just assume that the rest of the range
+			 * will eventually be written and there won't be any
+			 * delalloc blocks left.
+			 * TODO: the above assumption is troublesome, but might
+			 * work better in practice. Another option could be to
+			 * note somewhere that the cluster is getting written
+			 * out and detect that here.
+			 */
+			page_cache_release(page);
+			return 0;
+		}
+
+		if (!page_has_buffers(page))
+			goto nextpage;
+
+		head = page_buffers(page);
+		if (!head)
+			goto nextpage;
+
+		bh = head;
+		pg_lblk = index << (PAGE_CACHE_SHIFT -
+				    inode->i_blkbits);
+		do {
+			if (unlikely(pg_lblk < lblk_start)) {
+				/*
+				 * This is possible when fs block size is less
+				 * than page size and our cluster starts/ends in
+				 * the middle of the page. So we need to skip
+				 * the initial few blocks till we reach 'lblk'.
+				 */
+				pg_lblk++;
+				continue;
+			}
+
+			if (buffer_delay(bh)) {
+				page_cache_release(page);
+				return 1;
+			}
+			if (search_hint_reverse)
+				i--;
+			else
+				i++;
+		} while ((i >= lblk_start) && (i <= lblk_end) &&
+			 ((bh = bh->b_this_page) != head));
+nextpage:
+		if (page)
+			page_cache_release(page);
+		/*
+		 * Move to next page. 'i' will be the first lblk in the next
+		 * page.
+		 */
+		if (search_hint_reverse)
+			index--;
+		else
+			index++;
+		i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+	}
+
+	return 0;
+}
+
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
+			       int search_hint_reverse)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t lblk_start, lblk_end;
+	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
+
+	return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
+					search_hint_reverse);
+}
+
+/**
+ * Determines how many complete clusters (out of those specified by the 'map')
+ * are under delalloc and were reserved quota for.
+ * This function is called when we are writing out the blocks that were
+ * originally written with their allocation delayed, but then the space was
+ * allocated using fallocate() before the delayed allocation could be resolved.
+ * The cases to look for are:
+ * ('=' indicates delayed allocated blocks
+ *  '-' indicates non-delayed allocated blocks)
+ * (a) partial clusters towards beginning and/or end outside of allocated range
+ *     are not delalloc'ed.
+ *	Ex:
+ *	|----c---=|====c====|====c====|===-c----|
+ *	         |++++++ allocated ++++++|
+ *	==> 4 complete clusters in above example
+ *
+ * (b) partial cluster (outside of allocated range) towards either end is
+ *     marked for delayed allocation. In this case, we will exclude that
+ *     cluster.
+ *	Ex:
+ *	|----====c========|========c========|
+ *	         |++++++ allocated ++++++|
+ *	==> 1 complete cluster in above example
+ *
+ *	Ex:
+ *	|================c================|
+ *	         |++++++ allocated ++++++|
+ *	==> 0 complete clusters in above example
+ *
+ * The ext4_da_update_reserve_space will be called only if we
+ * determine here that there were some "entire" clusters that span
+ * this 'allocated' range.
+ * In the non-bigalloc case, this function will just end up returning num_blks
+ * without ever calling ext4_find_delalloc_range.
+ */
+static unsigned int
+get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
+			   unsigned int num_blks)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
+	ext4_lblk_t lblk_from, lblk_to, c_offset;
+	unsigned int allocated_clusters = 0;
+
+	alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
+	alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
+
+	/* max possible clusters for this allocation */
+	allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
+
+	/* Check towards left side */
+	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+	if (c_offset) {
+		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+		lblk_to = lblk_from + c_offset - 1;
+
+		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+			allocated_clusters--;
+	}
+
+	/* Now check towards right. */
+	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+	if (allocated_clusters && c_offset) {
+		lblk_from = lblk_start + num_blks;
+		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
+
+		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+			allocated_clusters--;
+	}
+
+	return allocated_clusters;
+}
+
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
 				      struct ext4_map_blocks *map,
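The boundary math in get_reserved_cluster_alloc() can be exercised stand-alone. The program below is an illustrative reimplementation, not ext4 code: the delalloc lookup is stubbed with a toy bitmap, RATIO stands in for sbi->s_cluster_ratio (assumed a power of two, as bigalloc guarantees), and the division mirrors EXT4_B2C().

/* Standalone sketch of get_reserved_cluster_alloc()'s boundary math
 * (illustrative; the delalloc lookup is stubbed with a bitmap). */
#include <stdio.h>

#define RATIO 4	/* sbi->s_cluster_ratio, assumed a power of two */

/* stand-in for ext4_find_delalloc_range(): 1 if any block in
 * [from, to] is marked delalloc in our toy bitmap */
static int find_delalloc_range(const int *delalloc, unsigned from, unsigned to)
{
	for (unsigned i = from; i <= to; i++)
		if (delalloc[i])
			return 1;
	return 0;
}

static unsigned get_reserved_clusters(const int *delalloc,
				      unsigned lblk_start, unsigned num_blks)
{
	unsigned start_c = lblk_start / RATIO;	/* EXT4_B2C() */
	unsigned end_c = (lblk_start + num_blks - 1) / RATIO;
	unsigned clusters = end_c - start_c + 1;	/* max possible */
	unsigned c_offset;

	c_offset = lblk_start & (RATIO - 1);	/* left partial cluster */
	if (c_offset &&
	    find_delalloc_range(delalloc, lblk_start & ~(RATIO - 1),
				(lblk_start & ~(RATIO - 1)) + c_offset - 1))
		clusters--;

	c_offset = (lblk_start + num_blks) & (RATIO - 1);	/* right partial */
	if (clusters && c_offset &&
	    find_delalloc_range(delalloc, lblk_start + num_blks,
				lblk_start + num_blks + (RATIO - c_offset) - 1))
		clusters--;

	return clusters;
}

int main(void)
{
	/* Case (b) from the comment above: block 2, left of the allocated
	 * range but in the same cluster, is still delalloc'ed. */
	int delalloc[16] = { [2] = 1 };

	/* allocating blocks 3..10 spans clusters [0-3],[4-7],[8-11] */
	printf("%u\n", get_reserved_clusters(delalloc, 3, 8)); /* prints 2 */
	return 0;
}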
@@ -3338,8 +3542,15 @@ out:
 	 * But fallocate would have already updated quota and block
 	 * count for this offset. So cancel these reservation
 	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-		ext4_da_update_reserve_space(inode, allocated, 0);
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		unsigned int reserved_clusters;
+		reserved_clusters = get_reserved_cluster_alloc(inode,
+				map->m_lblk, map->m_len);
+		if (reserved_clusters)
+			ext4_da_update_reserve_space(inode,
+						     reserved_clusters,
+						     0);
+	}
 
 map_out:
 	map->m_flags |= EXT4_MAP_MAPPED;
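In this path the blocks being written out were already allocated, and quota-charged, by an earlier fallocate(), so any outstanding delalloc reservation over them must be released rather than claimed; hence the 0 passed as the last argument above. The toy model below sketches the two modes of ext4_da_update_reserve_space() as the patch uses them; it is a simplification (the real function also handles metadata reservations), and toy_inode is invented for the example.

#include <stdio.h>

struct toy_inode {
	int reserved_clusters;	/* delalloc reservations outstanding */
	int quota_clusters;	/* clusters charged to quota          */
};

/* simplified model of ext4_da_update_reserve_space(inode, used, quota_claim) */
static void da_update_reserve_space(struct toy_inode *inode, int used,
				    int quota_claim)
{
	inode->reserved_clusters -= used;	/* reservation is consumed */
	if (quota_claim)
		inode->quota_clusters += used;	/* ...and charged to quota */
	/* quota_claim == 0: quota was already charged (here, by fallocate) */
}

int main(void)
{
	struct toy_inode ino = { .reserved_clusters = 1, .quota_clusters = 1 };

	/* writeout over a fallocated range: release, don't double-charge */
	da_update_reserve_space(&ino, 1, 0);
	printf("reserved=%d quota=%d\n", ino.reserved_clusters,
	       ino.quota_clusters);	/* prints: reserved=0 quota=1 */
	return 0;
}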
@@ -3484,6 +3695,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ext4_fsblk_t newblock = 0;
 	int free_on_err = 0, err = 0, depth, ret;
 	unsigned int allocated = 0, offset = 0;
+	unsigned int allocated_clusters = 0, reserved_clusters = 0;
 	unsigned int punched_out = 0;
 	unsigned int result = 0;
 	struct ext4_allocation_request ar;
@@ -3499,6 +3711,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
 	    ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
+			if ((sbi->s_cluster_ratio > 1) &&
+			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				/*
 				 * block isn't allocated yet and
@@ -3509,6 +3725,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			/* we should allocate requested block */
 		} else {
 			/* block is already allocated */
+			if (sbi->s_cluster_ratio > 1)
+				map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 			newblock = map->m_lblk
 				   - le32_to_cpu(newex.ee_block)
 				   + ext4_ext_pblock(&newex);
@@ -3665,6 +3883,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		}
 	}
 
+	if ((sbi->s_cluster_ratio > 1) &&
+	    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+
 	/*
 	 * requested block isn't allocated yet;
 	 * we couldn't try to create block if create flag is zero
@@ -3681,6 +3903,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Okay, we need to do block allocation.
 	 */
+	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
 	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
 
@@ -3692,6 +3915,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	    get_implied_cluster_alloc(sbi, map, ex, path)) {
 		ar.len = allocated = map->m_len;
 		newblock = map->m_pblk;
+		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		goto got_allocated_blocks;
 	}
 
@@ -3712,6 +3936,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	    get_implied_cluster_alloc(sbi, map, ex2, path)) {
 		ar.len = allocated = map->m_len;
 		newblock = map->m_pblk;
+		map->m_flags |= EXT4_MAP_FROM_CLUSTER;
 		goto got_allocated_blocks;
 	}
 
@@ -3765,6 +3990,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	ext_debug("allocate new block: goal %llu, found %llu/%u\n",
 		  ar.goal, newblock, allocated);
 	free_on_err = 1;
+	allocated_clusters = ar.len;
 	ar.len = EXT4_C2B(sbi, ar.len) - offset;
 	if (ar.len > allocated)
 		ar.len = allocated;
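ar.len comes back from the allocator in cluster units; the hunk above first saves it as allocated_clusters for the quota logic further down, then converts it to blocks and trims the blocks that precede the requested offset inside the first cluster. A small arithmetic sketch with invented values:

#include <stdio.h>

int main(void)
{
	unsigned int ratio = 4;		/* sbi->s_cluster_ratio (invented) */
	unsigned int ar_len = 2;	/* allocator result, in clusters */
	unsigned int offset = 1;	/* m_lblk's offset within its cluster */

	unsigned int allocated_clusters = ar_len;	/* saved for quota */
	unsigned int blocks = ar_len * ratio - offset;	/* EXT4_C2B() - offset */

	printf("%u clusters -> %u usable blocks\n", allocated_clusters, blocks);
	return 0;	/* prints: 2 clusters -> 7 usable blocks */
}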
@@ -3822,8 +4048,80 @@ got_allocated_blocks:
 	 * Update reserved blocks/metadata blocks after successful
 	 * block allocation which had been deferred till now.
 	 */
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
-		ext4_da_update_reserve_space(inode, allocated, 1);
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+		/*
+		 * Check how many clusters we had reserved for this
+		 * allocated range.
+		 */
+		reserved_clusters = get_reserved_cluster_alloc(inode,
+						map->m_lblk, allocated);
+		if (map->m_flags & EXT4_MAP_FROM_CLUSTER) {
+			if (reserved_clusters) {
+				/*
+				 * We have clusters reserved for this range.
+				 * But since we are not doing actual allocation
+				 * and are simply using blocks from a previously
+				 * allocated cluster, we should release the
+				 * reservation and not claim quota.
+				 */
+				ext4_da_update_reserve_space(inode,
+						reserved_clusters, 0);
+			}
+		} else {
+			BUG_ON(allocated_clusters < reserved_clusters);
+			/* We will claim quota for all newly allocated blocks.*/
+			ext4_da_update_reserve_space(inode, allocated_clusters,
+							1);
+			if (reserved_clusters < allocated_clusters) {
+				int reservation = allocated_clusters -
+						  reserved_clusters;
+				/*
+				 * It seems we claimed a few clusters outside of
+				 * the range of this allocation. We should give
+				 * them back to the reservation pool. This can
+				 * happen in the following case:
+				 *
+				 * * Suppose s_cluster_ratio is 4 (i.e., each
+				 *   cluster has 4 blocks). Thus, the clusters
+				 *   are [0-3],[4-7],[8-11]...
+				 * * First comes a delayed allocation write for
+				 *   logical blocks 10 & 11. Since there were no
+				 *   previous delayed allocated blocks in the
+				 *   range [8-11], we would reserve 1 cluster
+				 *   for this write.
+				 * * Next comes a write for logical blocks 3 to
+				 *   8. In this case, we will reserve 2 clusters
+				 *   (for [0-3] and [4-7]; and not for [8-11],
+				 *   as that range has delayed allocated
+				 *   blocks). Thus the total reserved cluster
+				 *   count now becomes 3.
+				 * * Now, at delayed allocation writeout time,
+				 *   we will first write blocks [3-8] and
+				 *   allocate 3 clusters for writing these
+				 *   blocks. Also, we would claim all these
+				 *   three clusters above.
+				 * * Now when we come here to write out blocks
+				 *   [10-11], we would expect to claim the
+				 *   reservation of 1 cluster we had made (and
+				 *   we would claim it since there are no more
+				 *   delayed allocated blocks in the range
+				 *   [8-11]). But our reserved cluster count has
+				 *   already gone to 0.
+				 *
+				 * Thus, at the step 4 above when we determine
+				 * that there are still some unwritten delayed
+				 * allocated blocks outside of our current
+				 * block range, we should increment the
+				 * reserved clusters count so that when the
+				 * remaining blocks finally get written, we
+				 * can claim them.
+				 */
+				while (reservation) {
+					ext4_da_reserve_space(inode,
+							      map->m_lblk);
+					reservation--;
+				}
+			}
+		}
+	}
 
 	/*
 	 * Cache the extent and update transaction to commit on fdatasync only
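
The scenario in the final comment can be traced numerically. The toy program below models only the reservation counter (it is not ext4 code; the counter stands in for the inode's reserved-cluster accounting) and shows why the extra ext4_da_reserve_space() calls in the while loop are needed:

/* Toy trace of the comment's scenario (s_cluster_ratio == 4);
 * "reserved" models the inode's reserved-cluster count only. */
#include <stdio.h>

int main(void)
{
	int reserved = 0;

	reserved += 1;	/* delalloc write of blocks 10-11: cluster [8-11] */
	reserved += 2;	/* delalloc write of blocks 3-8: clusters [0-3] and
			 * [4-7]; [8-11] is already reserved */

	/* writeout of blocks 3-8 allocates clusters [0-3],[4-7],[8-11]
	 * and claims all three reservations */
	reserved -= 3;

	/* without the fix, writing out blocks 10-11 would try to claim
	 * one more reservation from a count that is already 0 */
	printf("reserved after first writeout: %d\n", reserved);  /* 0 */

	/* the patch re-reserves one cluster during the first writeout for
	 * the still-delalloc'ed blocks [10-11] outside the written range */
	reserved += 1;	/* ext4_da_reserve_space() in the while loop */
	reserved -= 1;	/* later writeout of blocks 10-11 claims it  */
	printf("reserved after second writeout: %d\n", reserved); /* 0 */
	return 0;
}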