Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--	fs/ext4/mballoc.c	893
1 file changed, 579 insertions(+), 314 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4b4ad4b7ce57..6ed859d56850 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
  * between CPUs. It is possible to get scheduled at this point.
  *
  * The locality group prealloc space is used looking at whether we have
- * enough free space (pa_free) withing the prealloc space.
+ * enough free space (pa_free) within the prealloc space.
  *
  * If we can't allocate blocks via inode prealloc or/and locality group
  * prealloc then we look at the buddy cache. The buddy cache is represented
@@ -338,6 +338,19 @@
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
 static struct kmem_cache *ext4_free_ext_cachep;
+
+/* We create slab caches for groupinfo data structures based on the
+ * superblock block size.  There will be one per mounted filesystem for
+ * each unique s_blocksize_bits */
+#define NR_GRPINFO_CACHES 8
+static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
+
+static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
+	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
+	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
+	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
+};
+
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -419,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 	}
 
 	/* at order 0 we see each particular block */
-	*max = 1 << (e4b->bd_blkbits + 3);
-	if (order == 0)
+	if (order == 0) {
+		*max = 1 << (e4b->bd_blkbits + 3);
 		return EXT4_MB_BITMAP(e4b);
+	}
 
 	bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
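
For reference, at order 0 the "buddy" is just the block bitmap itself, one bit per block, so *max is the number of bits in one block-sized bitmap: 1 << (bd_blkbits + 3). A standalone sketch of that arithmetic (illustration only, not part of the patch):

	#include <stdio.h>

	/* a block of (1 << blkbits) bytes holds (1 << blkbits) * 8 bits */
	int main(void)
	{
		for (int blkbits = 10; blkbits <= 12; blkbits++)
			printf("block size %5d -> order-0 max = %d bits\n",
			       1 << blkbits, 1 << (blkbits + 3));
		return 0;	/* prints 8192, 16384, 32768 */
	}
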
@@ -603,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 	MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 
 	grp = ext4_get_group_info(sb, e4b->bd_group);
-	buddy = mb_find_buddy(e4b, 0, &max);
 	list_for_each(cur, &grp->bb_prealloc_list) {
 		ext4_group_t groupnr;
 		struct ext4_prealloc_space *pa;
@@ -622,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 #define mb_check_buddy(e4b)
 #endif
 
-/* FIXME!! need more doc */
+/*
+ * Divide the blocks starting at @first with length @len into
+ * smaller chunks of power-of-2 size.
+ * Clear the bits in the bitmap covered by the blocks of each chunk,
+ * then increase bb_counters[] for the corresponding chunk size.
+ */
 static void ext4_mb_mark_free_simple(struct super_block *sb,
 				void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 					struct ext4_group_info *grp)
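
The new comment describes a buddy-style decomposition: the free range is carved into maximal power-of-two chunks whose alignment matches their size, picking at each step the smaller of the order allowed by the alignment of the current position and the order allowed by the remaining length. A minimal userspace model of that loop (a sketch only; the kernel version also clears buddy-bitmap bits via s_mb_offsets[]):

	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	/* Split [first, first+len) into aligned power-of-2 chunks, the way
	 * ext4_mb_mark_free_simple does when it fills bb_counters[]. */
	static void mark_free_simple(unsigned first, unsigned len)
	{
		while (len > 0) {
			/* largest order allowed by the alignment of 'first';
			 * 0x10000 caps the order, like 'border' in the kernel */
			int max = ffs(first | 0x10000) - 1;
			/* largest order allowed by the remaining length: fls(len) - 1 */
			int min = 32 - __builtin_clz(len) - 1;

			if (max < min)
				min = max;
			printf("chunk at %u, order %d (%u blocks)\n",
			       first, min, 1u << min);
			first += 1u << min;
			len -= 1u << min;
		}
	}

	int main(void)
	{
		mark_free_simple(5, 11);	/* orders 0, 1, 3: blocks 5, 6-7, 8-15 */
		return 0;
	}
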
@@ -769,6 +787,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	struct inode *inode;
 	char *data;
 	char *bitmap;
+	struct ext4_group_info *grinfo;
 
 	mb_debug(1, "init page %lu\n", page->index);
 
@@ -801,6 +820,18 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		if (first_group + i >= ngroups)
 			break;
 
+		grinfo = ext4_get_group_info(sb, first_group + i);
+		/*
+		 * If page is uptodate then we came here after online resize
+		 * which added some new uninitialized group info structs, so
+		 * we must skip all initialized uptodate buddies on the page,
+		 * which may be currently in use by an allocating task.
+		 */
+		if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
+			bh[i] = NULL;
+			continue;
+		}
+
 		err = -EIO;
 		desc = ext4_get_group_desc(sb, first_group + i, NULL);
 		if (desc == NULL)
@@ -853,26 +884,28 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	}
 
 	/* wait for I/O completion */
-	for (i = 0; i < groups_per_page && bh[i]; i++)
-		wait_on_buffer(bh[i]);
+	for (i = 0; i < groups_per_page; i++)
+		if (bh[i])
+			wait_on_buffer(bh[i]);
 
 	err = -EIO;
-	for (i = 0; i < groups_per_page && bh[i]; i++)
-		if (!buffer_uptodate(bh[i]))
+	for (i = 0; i < groups_per_page; i++)
+		if (bh[i] && !buffer_uptodate(bh[i]))
 			goto out;
 
 	err = 0;
 	first_block = page->index * blocks_per_page;
-	/* init the page */
-	memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
 	for (i = 0; i < blocks_per_page; i++) {
 		int group;
-		struct ext4_group_info *grinfo;
 
 		group = (first_block + i) >> 1;
 		if (group >= ngroups)
 			break;
 
+		if (!bh[group - first_group])
+			/* skip initialized uptodate buddy */
+			continue;
+
 		/*
 		 * data carry information regarding this
 		 * particular group in the format specified
@@ -901,6 +934,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		 * incore got set to the group block bitmap below
 		 */
 		ext4_lock_group(sb, group);
+		/* init the buddy */
+		memset(data, 0xff, blocksize);
 		ext4_mb_generate_buddy(sb, data, incore, group);
 		ext4_unlock_group(sb, group);
 		incore = NULL;
@@ -930,7 +965,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 out:
 	if (bh) {
-		for (i = 0; i < groups_per_page && bh[i]; i++)
+		for (i = 0; i < groups_per_page; i++)
 			brelse(bh[i]);
 		if (bh != &bhs)
 			kfree(bh);
@@ -939,6 +974,67 @@ out:
 }
 
 /*
+ * Lock the buddy and bitmap pages. This makes sure other parallel init_group
+ * on the same buddy page doesn't happen while holding the buddy page lock.
+ * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
+ * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
+ */
+static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
+	int block, pnum, poff;
+	int blocks_per_page;
+	struct page *page;
+
+	e4b->bd_buddy_page = NULL;
+	e4b->bd_bitmap_page = NULL;
+
+	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+	/*
+	 * the buddy cache inode stores the block bitmap
+	 * and buddy information in consecutive blocks.
+	 * So for each group we need two blocks.
+	 */
+	block = group * 2;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		return -EIO;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_bitmap_page = page;
+	e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
+
+	if (blocks_per_page >= 2) {
+		/* buddy and bitmap are on the same page */
+		return 0;
+	}
+
+	block++;
+	pnum = block / blocks_per_page;
+	poff = block % blocks_per_page;
+	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
+	if (!page)
+		return -EIO;
+	BUG_ON(page->mapping != inode->i_mapping);
+	e4b->bd_buddy_page = page;
+	return 0;
+}
+
+static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
+{
+	if (e4b->bd_bitmap_page) {
+		unlock_page(e4b->bd_bitmap_page);
+		page_cache_release(e4b->bd_bitmap_page);
+	}
+	if (e4b->bd_buddy_page) {
+		unlock_page(e4b->bd_buddy_page);
+		page_cache_release(e4b->bd_buddy_page);
+	}
+}
+
+/*
  * Locking note: This routine calls ext4_mb_init_cache(), which takes the
  * block group lock of all groups for this page; do not hold the BG lock when
  * calling this routine!
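
The new helper relies on the fixed layout of the buddy cache inode: logical block 2*g holds group g's block bitmap and block 2*g+1 its buddy data, so a page either holds both (small blocks) or they land on consecutive pages (block size equal to the page size). A small model of the page/offset arithmetic (illustration only, not part of the patch; a 4096-byte page cache is assumed):

	#include <stdio.h>

	#define PAGE_SIZE_BYTES 4096

	/* For group g, the bitmap lives in logical block 2*g of the buddy
	 * cache inode and the buddy in block 2*g + 1. */
	static void locate(unsigned group, unsigned blocksize)
	{
		unsigned blocks_per_page = PAGE_SIZE_BYTES / blocksize;
		unsigned block = group * 2;

		printf("group %u, blocksize %u: bitmap page %u off %u; buddy page %u off %u\n",
		       group, blocksize,
		       block / blocks_per_page, block % blocks_per_page,
		       (block + 1) / blocks_per_page, (block + 1) % blocks_per_page);
	}

	int main(void)
	{
		locate(3, 4096);	/* one block per page: bitmap page 6, buddy page 7 */
		locate(3, 1024);	/* four blocks per page: both land on page 1 */
		return 0;
	}
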
@@ -947,93 +1043,60 @@ static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
 
-	int ret = 0;
-	void *bitmap;
-	int blocks_per_page;
-	int block, pnum, poff;
-	int num_grp_locked = 0;
 	struct ext4_group_info *this_grp;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct inode *inode = sbi->s_buddy_cache;
-	struct page *page = NULL, *bitmap_page = NULL;
+	struct ext4_buddy e4b;
+	struct page *page;
+	int ret = 0;
 
 	mb_debug(1, "init group %u\n", group);
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 	this_grp = ext4_get_group_info(sb, group);
 	/*
 	 * This ensures that we don't reinit the buddy cache
 	 * page which map to the group from which we are already
 	 * allocating. If we are looking at the buddy cache we would
 	 * have taken a reference using ext4_mb_load_buddy and that
-	 * would have taken the alloc_sem lock.
+	 * would have pinned buddy page to page cache.
 	 */
-	num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
-	if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
+	ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
+	if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
 		/*
 		 * somebody initialized the group
 		 * return without doing anything
 		 */
-		ret = 0;
 		goto err;
 	}
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, NULL);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
-	}
-	if (page == NULL || !PageUptodate(page)) {
+
+	page = e4b.bd_bitmap_page;
+	ret = ext4_mb_init_cache(page, NULL);
+	if (ret)
+		goto err;
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
 	mark_page_accessed(page);
-	bitmap_page = page;
-	bitmap = page_address(page) + (poff * sb->s_blocksize);
 
-	/* init buddy cache */
-	block++;
-	pnum = block / blocks_per_page;
-	poff = block % blocks_per_page;
-	page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
-	if (page == bitmap_page) {
+	if (e4b.bd_buddy_page == NULL) {
 		/*
 		 * If both the bitmap and buddy are in
 		 * the same page we don't need to force
 		 * init the buddy
 		 */
-		unlock_page(page);
-	} else if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
-		ret = ext4_mb_init_cache(page, bitmap);
-		if (ret) {
-			unlock_page(page);
-			goto err;
-		}
-		unlock_page(page);
+		ret = 0;
+		goto err;
 	}
-	if (page == NULL || !PageUptodate(page)) {
+	/* init buddy cache */
+	page = e4b.bd_buddy_page;
+	ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
+	if (ret)
+		goto err;
+	if (!PageUptodate(page)) {
 		ret = -EIO;
 		goto err;
 	}
 	mark_page_accessed(page);
 err:
-	ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
-	if (bitmap_page)
-		page_cache_release(bitmap_page);
-	if (page)
-		page_cache_release(page);
+	ext4_mb_put_buddy_page_lock(&e4b);
 	return ret;
 }
 
@@ -1067,24 +1130,8 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 	e4b->bd_group = group;
 	e4b->bd_buddy_page = NULL;
 	e4b->bd_bitmap_page = NULL;
-	e4b->alloc_semp = &grp->alloc_sem;
-
-	/* Take the read lock on the group alloc
-	 * sem. This would make sure a parallel
-	 * ext4_mb_init_group happening on other
-	 * groups mapped by the page is blocked
-	 * till we are done with allocation
-	 */
-repeat_load_buddy:
-	down_read(e4b->alloc_semp);
 
 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
-		/* we need to check for group need init flag
-		 * with alloc_semp held so that we can be sure
-		 * that new blocks didn't get added to the group
-		 * when we are loading the buddy cache
-		 */
-		up_read(e4b->alloc_semp);
 		/*
 		 * we need full data about the group
 		 * to make a good selection
@@ -1092,7 +1139,6 @@ repeat_load_buddy:
 		ret = ext4_mb_init_group(sb, group);
 		if (ret)
 			return ret;
-		goto repeat_load_buddy;
 	}
 
 	/*
@@ -1176,15 +1222,14 @@ repeat_load_buddy:
 	return 0;
 
 err:
+	if (page)
+		page_cache_release(page);
 	if (e4b->bd_bitmap_page)
 		page_cache_release(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
 		page_cache_release(e4b->bd_buddy_page);
 	e4b->bd_buddy = NULL;
 	e4b->bd_bitmap = NULL;
-
-	/* Done with the buddy cache */
-	up_read(e4b->alloc_semp);
 	return ret;
 }
 
@@ -1194,9 +1239,6 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 		page_cache_release(e4b->bd_bitmap_page);
 	if (e4b->bd_buddy_page)
 		page_cache_release(e4b->bd_buddy_page);
-	/* Done with the buddy cache */
-	if (e4b->alloc_semp)
-		up_read(e4b->alloc_semp);
 }
 
 
@@ -1509,9 +1551,6 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	get_page(ac->ac_bitmap_page);
 	ac->ac_buddy_page = e4b->bd_buddy_page;
 	get_page(ac->ac_buddy_page);
-	/* on allocation we use ac to track the held semaphore */
-	ac->alloc_semp = e4b->alloc_semp;
-	e4b->alloc_semp = NULL;
 	/* store last allocated for subsequent stream allocation */
 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		spin_lock(&sbi->s_md_lock);
@@ -1915,84 +1954,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	return 0;
 }
 
-/*
- * lock the group_info alloc_sem of all the groups
- * belonging to the same buddy cache page. This
- * make sure other parallel operation on the buddy
- * cache doesn't happen whild holding the buddy cache
- * lock
- */
-int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	int groups_per_page;
-	ext4_group_t ngroups = ext4_get_groups_count(sb);
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-
-	groups_per_page = blocks_per_page >> 1;
-	if (groups_per_page == 0)
-		groups_per_page = 1;
-	/* read all groups the page covers into the cache */
-	for (i = 0; i < groups_per_page; i++) {
-
-		if ((first_group + i) >= ngroups)
-			break;
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		down_write_nested(&grp->alloc_sem, i);
-	}
-	return i;
-}
-
-void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
-					ext4_group_t group, int locked_group)
-{
-	int i;
-	int block, pnum;
-	int blocks_per_page;
-	ext4_group_t first_group;
-	struct ext4_group_info *grp;
-
-	blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
-	/*
-	 * the buddy cache inode stores the block bitmap
-	 * and buddy information in consecutive blocks.
-	 * So for each group we need two blocks.
-	 */
-	block = group * 2;
-	pnum = block / blocks_per_page;
-	first_group = pnum * blocks_per_page / 2;
-	/* release locks on all the groups */
-	for (i = 0; i < locked_group; i++) {
-
-		grp = ext4_get_group_info(sb, first_group + i);
-		/* take all groups write allocation
-		 * semaphore. This make sure there is
-		 * no block allocation going on in any
-		 * of that groups
-		 */
-		up_write(&grp->alloc_sem);
-	}
-
-}
-
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
@@ -2233,15 +2194,24 @@ static const struct file_operations ext4_mb_seq_groups_fops = {
 	.release	= seq_release,
 };
 
+static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
+{
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
+
+	BUG_ON(!cachep);
+	return cachep;
+}
 
 /* Create and initialize ext4_group_info data for the given group. */
 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			  struct ext4_group_desc *desc)
 {
-	int i, len;
+	int i;
 	int metalen = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_info **meta_group_info;
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	/*
 	 * First check if this group is the first of a reserved block.
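
get_groupinfo_cache() maps a block size to one of the NR_GRPINFO_CACHES slabs declared earlier. Assuming EXT4_MIN_BLOCK_LOG_SIZE is 10 (1k blocks, the ext4 minimum), the index works out as below (illustration only, not part of the patch):

	#include <stdio.h>

	#define EXT4_MIN_BLOCK_LOG_SIZE 10	/* 1k is the smallest ext4 block size */
	#define NR_GRPINFO_CACHES 8

	static const char *names[NR_GRPINFO_CACHES] = {
		"ext4_groupinfo_1k",  "ext4_groupinfo_2k",  "ext4_groupinfo_4k",
		"ext4_groupinfo_8k",  "ext4_groupinfo_16k", "ext4_groupinfo_32k",
		"ext4_groupinfo_64k", "ext4_groupinfo_128k"
	};

	int main(void)
	{
		for (int bits = 10; bits <= 17; bits++)
			printf("blocksize_bits %d (%6d bytes) -> cache %d: %s\n",
			       bits, 1 << bits, bits - EXT4_MIN_BLOCK_LOG_SIZE,
			       names[bits - EXT4_MIN_BLOCK_LOG_SIZE]);
		return 0;
	}
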
@@ -2261,22 +2231,16 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 			meta_group_info;
 	}
 
-	/*
-	 * calculate needed size. if change bb_counters size,
-	 * don't forget about ext4_mb_generate_buddy()
-	 */
-	len = offsetof(typeof(**meta_group_info),
-		       bb_counters[sb->s_blocksize_bits + 2]);
-
 	meta_group_info =
 		sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
 	i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
 
-	meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+	meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
 	if (meta_group_info[i] == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
 		goto exit_group_info;
 	}
+	memset(meta_group_info[i], 0, kmem_cache_size(cachep));
 	set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
 		&(meta_group_info[i]->bb_state));
 
@@ -2331,6 +2295,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	int num_meta_group_infos_max;
 	int array_size;
 	struct ext4_group_desc *desc;
+	struct kmem_cache *cachep;
 
 	/* This is the number of blocks used by GDT */
 	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
@@ -2363,7 +2328,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
 	 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
 	 * So a two level scheme suffices for now. */
-	sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
+	sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
 	if (sbi->s_group_info == NULL) {
 		printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
 		return -ENOMEM;
@@ -2373,6 +2338,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		printk(KERN_ERR "EXT4-fs: can't get new inode\n");
 		goto err_freesgi;
 	}
+	sbi->s_buddy_cache->i_ino = get_next_ino();
 	EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
 	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
@@ -2388,8 +2354,9 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	return 0;
 
 err_freebuddy:
+	cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 	while (i-- > 0)
-		kfree(ext4_get_group_info(sb, i));
+		kmem_cache_free(cachep, ext4_get_group_info(sb, i));
 	i = num_meta_group_infos;
 	while (i-- > 0)
 		kfree(sbi->s_group_info[i]);
@@ -2399,6 +2366,55 @@ err_freesgi:
 	return -ENOMEM;
 }
 
+static void ext4_groupinfo_destroy_slabs(void)
+{
+	int i;
+
+	for (i = 0; i < NR_GRPINFO_CACHES; i++) {
+		if (ext4_groupinfo_caches[i])
+			kmem_cache_destroy(ext4_groupinfo_caches[i]);
+		ext4_groupinfo_caches[i] = NULL;
+	}
+}
+
+static int ext4_groupinfo_create_slab(size_t size)
+{
+	static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
+	int slab_size;
+	int blocksize_bits = order_base_2(size);
+	int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
+	struct kmem_cache *cachep;
+
+	if (cache_index >= NR_GRPINFO_CACHES)
+		return -EINVAL;
+
+	if (unlikely(cache_index < 0))
+		cache_index = 0;
+
+	mutex_lock(&ext4_grpinfo_slab_create_mutex);
+	if (ext4_groupinfo_caches[cache_index]) {
+		mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+		return 0;	/* Already created */
+	}
+
+	slab_size = offsetof(struct ext4_group_info,
+				bb_counters[blocksize_bits + 2]);
+
+	cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
+					slab_size, 0, SLAB_RECLAIM_ACCOUNT,
+					NULL);
+
+	mutex_unlock(&ext4_grpinfo_slab_create_mutex);
+	if (!cachep) {
+		printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
+		return -ENOMEM;
+	}
+
+	ext4_groupinfo_caches[cache_index] = cachep;
+
+	return 0;
+}
+
 int ext4_mb_init(struct super_block *sb, int needs_recovery)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
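
ext4_groupinfo_create_slab() sizes the slab with offsetof() because bb_counters[] is a flexible trailing array: a group needs one counter per buddy order, i.e. blocksize_bits + 2 entries. A userspace sketch of the same sizing trick (the struct is a simplified stand-in; the real ext4_group_info has more fields):

	#include <stdio.h>
	#include <stddef.h>

	/* simplified stand-in for struct ext4_group_info */
	struct group_info {
		unsigned long bb_state;
		int bb_free;
		unsigned int bb_counters[];	/* one slot per buddy order */
	};

	int main(void)
	{
		for (int bits = 10; bits <= 12; bits++)
			printf("blocksize_bits %d -> slab size %zu bytes (%d counters)\n",
			       bits,
			       offsetof(struct group_info, bb_counters[bits + 2]),
			       bits + 2);
		return 0;
	}
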
@@ -2411,16 +2427,21 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_offsets == NULL) {
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
 	sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
 	if (sbi->s_mb_maxs == NULL) {
-		kfree(sbi->s_mb_offsets);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
+	ret = ext4_groupinfo_create_slab(sb->s_blocksize);
+	if (ret < 0)
+		goto out;
+
 	/* order 0 is regular bitmap */
 	sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
 	sbi->s_mb_offsets[0] = 0;
@@ -2439,9 +2460,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	/* init file for buddy data */
 	ret = ext4_mb_init_backend(sb);
 	if (ret != 0) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return ret;
+		goto out;
 	}
 
 	spin_lock_init(&sbi->s_md_lock);
@@ -2456,9 +2475,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
 	if (sbi->s_locality_groups == NULL) {
-		kfree(sbi->s_mb_offsets);
-		kfree(sbi->s_mb_maxs);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 	for_each_possible_cpu(i) {
 		struct ext4_locality_group *lg;
@@ -2475,7 +2493,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 
 	if (sbi->s_journal)
 		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-	return 0;
+out:
+	if (ret) {
+		kfree(sbi->s_mb_offsets);
+		kfree(sbi->s_mb_maxs);
+	}
+	return ret;
 }
 
 /* need to called with the ext4 group lock held */
@@ -2503,6 +2526,7 @@ int ext4_mb_release(struct super_block *sb)
 	int num_meta_group_infos;
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
 	if (sbi->s_group_info) {
 		for (i = 0; i < ngroups; i++) {
@@ -2513,7 +2537,7 @@ int ext4_mb_release(struct super_block *sb)
 			ext4_lock_group(sb, i);
 			ext4_mb_cleanup_pa(grinfo);
 			ext4_unlock_group(sb, i);
-			kfree(grinfo);
+			kmem_cache_free(cachep, grinfo);
 		}
 		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
@@ -2557,20 +2581,15 @@ int ext4_mb_release(struct super_block *sb)
 	return 0;
 }
 
-static inline void ext4_issue_discard(struct super_block *sb,
+static inline int ext4_issue_discard(struct super_block *sb,
 		ext4_group_t block_group, ext4_grpblk_t block, int count)
 {
-	int ret;
 	ext4_fsblk_t discard_block;
 
 	discard_block = block + ext4_group_first_block_no(sb, block_group);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	ret = sb_issue_discard(sb, discard_block, count);
-	if (ret == EOPNOTSUPP) {
-		ext4_warning(sb, "discard not supported, disabling");
-		clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
-	}
+	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 }
 
 /*
@@ -2594,7 +2613,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 
 		if (test_opt(sb, DISCARD))
 			ext4_issue_discard(sb, entry->group,
-					entry->start_blk, entry->count);
+					   entry->start_blk, entry->count);
 
 	err = ext4_mb_load_buddy(sb, entry->group, &e4b);
 	/* we expect to find existing buddy because it's pinned */
@@ -2658,28 +2677,22 @@ static void ext4_remove_debugfs_entry(void)
 
 #endif
 
-int __init init_ext4_mballoc(void)
+int __init ext4_init_mballoc(void)
 {
-	ext4_pspace_cachep =
-		kmem_cache_create("ext4_prealloc_space",
-				     sizeof(struct ext4_prealloc_space),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
+					SLAB_RECLAIM_ACCOUNT);
 	if (ext4_pspace_cachep == NULL)
 		return -ENOMEM;
 
-	ext4_ac_cachep =
-		kmem_cache_create("ext4_alloc_context",
-				     sizeof(struct ext4_allocation_context),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
+				    SLAB_RECLAIM_ACCOUNT);
 	if (ext4_ac_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		return -ENOMEM;
 	}
 
-	ext4_free_ext_cachep =
-		kmem_cache_create("ext4_free_block_extents",
-				     sizeof(struct ext4_free_data),
-				     0, SLAB_RECLAIM_ACCOUNT, NULL);
+	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
+					  SLAB_RECLAIM_ACCOUNT);
 	if (ext4_free_ext_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		kmem_cache_destroy(ext4_ac_cachep);
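
The KMEM_CACHE() wrapper is shorthand for kmem_cache_create() with the cache named after the struct and sized and aligned from it. Roughly (a paraphrase of the include/linux/slab.h macro of this era, reproduced from memory):

	#define KMEM_CACHE(__struct, __flags)					\
		kmem_cache_create(#__struct, sizeof(struct __struct),		\
				  __alignof__(struct __struct), (__flags), NULL)

One visible consequence in this hunk: the caches are now named after the structs ("ext4_allocation_context", "ext4_free_data") instead of the old hand-written strings ("ext4_alloc_context", "ext4_free_block_extents").
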
@@ -2689,7 +2702,7 @@ int __init init_ext4_mballoc(void)
 	return 0;
 }
 
-void exit_ext4_mballoc(void)
+void ext4_exit_mballoc(void)
 {
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
@@ -2699,6 +2712,7 @@ void exit_ext4_mballoc(void)
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
 	kmem_cache_destroy(ext4_free_ext_cachep);
+	ext4_groupinfo_destroy_slabs();
 	ext4_remove_debugfs_entry();
 }
 
@@ -3135,7 +3149,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
 	cur_distance = abs(goal_block - cpa->pa_pstart);
 	new_distance = abs(goal_block - pa->pa_pstart);
 
-	if (cur_distance < new_distance)
+	if (cur_distance <= new_distance)
 		return cpa;
 
 	/* drop the previous reference */
@@ -3535,8 +3549,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  */
 static noinline_for_stack int
 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
-			struct ext4_prealloc_space *pa,
-			struct ext4_allocation_context *ac)
+			struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3554,11 +3567,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	end = bit + pa->pa_len;
 
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = pa->pa_inode;
-	}
-
 	while (bit < end) {
 		bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
 		if (bit >= end)
@@ -3569,15 +3577,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 			 (unsigned) next - bit, (unsigned) group);
 		free += next - bit;
 
-		if (ac) {
-			ac->ac_b_ex.fe_group = group;
-			ac->ac_b_ex.fe_start = bit;
-			ac->ac_b_ex.fe_len = next - bit;
-			ac->ac_b_ex.fe_logical = 0;
-			trace_ext4_mballoc_discard(ac);
-		}
-
-		trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit,
+		trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
+		trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
 					       next - bit);
 		mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
 		bit = next + 1;
@@ -3601,29 +3602,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
 
 static noinline_for_stack int
 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
-				struct ext4_prealloc_space *pa,
-				struct ext4_allocation_context *ac)
+				struct ext4_prealloc_space *pa)
 {
 	struct super_block *sb = e4b->bd_sb;
 	ext4_group_t group;
 	ext4_grpblk_t bit;
 
-	trace_ext4_mb_release_group_pa(sb, ac, pa);
+	trace_ext4_mb_release_group_pa(pa);
 	BUG_ON(pa->pa_deleted == 0);
 	ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
 	BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
 	mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
 	atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
-
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = NULL;
-		ac->ac_b_ex.fe_group = group;
-		ac->ac_b_ex.fe_start = bit;
-		ac->ac_b_ex.fe_len = pa->pa_len;
-		ac->ac_b_ex.fe_logical = 0;
-		trace_ext4_mballoc_discard(ac);
-	}
+	trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
 
 	return 0;
 }
@@ -3644,7 +3635,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	struct list_head list;
 	struct ext4_buddy e4b;
 	int err;
@@ -3673,9 +3663,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
 	needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
 
 	INIT_LIST_HEAD(&list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 repeat:
 	ext4_lock_group(sb, group);
 	list_for_each_entry_safe(pa, tmp,
@@ -3730,9 +3717,9 @@ repeat:
 		spin_unlock(pa->pa_obj_lock);
 
 		if (pa->pa_type == MB_GROUP_PA)
-			ext4_mb_release_group_pa(&e4b, pa, ac);
+			ext4_mb_release_group_pa(&e4b, pa);
 		else
-			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
@@ -3740,8 +3727,6 @@ repeat:
 
 out:
 	ext4_unlock_group(sb, group);
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
@@ -3762,7 +3747,6 @@ void ext4_discard_preallocations(struct inode *inode)
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *bitmap_bh = NULL;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 	ext4_group_t group = 0;
 	struct list_head list;
 	struct ext4_buddy e4b;
@@ -3778,11 +3762,6 @@ void ext4_discard_preallocations(struct inode *inode)
 
 	INIT_LIST_HEAD(&list);
 
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_sb = sb;
-		ac->ac_inode = inode;
-	}
 repeat:
 	/* first, collect all pa's in the inode */
 	spin_lock(&ei->i_prealloc_lock);
@@ -3852,7 +3831,7 @@ repeat:
 
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
+		ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
@@ -3861,30 +3840,16 @@ repeat:
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
 
-/*
- * finds all preallocated spaces and return blocks being freed to them
- * if preallocated space becomes full (no block is used from the space)
- * then the function frees space in buddy
- * XXX: at the moment, truncate (which is the only way to free blocks)
- * discards all preallocations
- */
-static void ext4_mb_return_to_preallocation(struct inode *inode,
-					struct ext4_buddy *e4b,
-					sector_t block, int count)
-{
-	BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
-}
 #ifdef CONFIG_EXT4_DEBUG
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
 	struct super_block *sb = ac->ac_sb;
 	ext4_group_t ngroups, i;
 
-	if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+	if (!mb_enable_debug ||
+	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return;
 
 	printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4060,14 +4025,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 	struct ext4_buddy e4b;
 	struct list_head discard_list;
 	struct ext4_prealloc_space *pa, *tmp;
-	struct ext4_allocation_context *ac;
 
 	mb_debug(1, "discard locality group preallocation\n");
 
 	INIT_LIST_HEAD(&discard_list);
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac)
-		ac->ac_sb = sb;
 
 	spin_lock(&lg->lg_prealloc_lock);
 	list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
@@ -4119,15 +4080,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		}
 		ext4_lock_group(sb, group);
 		list_del(&pa->pa_group_list);
-		ext4_mb_release_group_pa(&e4b, pa, ac);
+		ext4_mb_release_group_pa(&e4b, pa);
 		ext4_unlock_group(sb, group);
 
 		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 }
 
 /*
@@ -4203,15 +4162,12 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 			spin_unlock(&pa->pa_lock);
 		}
 	}
-	if (ac->alloc_semp)
-		up_read(ac->alloc_semp);
 	if (pa) {
 		/*
 		 * We want to add the pa to the right bucket.
 		 * Remove it from the list and while adding
 		 * make sure the list to which we are adding
-		 * doesn't grow big.  We need to release
-		 * alloc_semp before calling ext4_mb_add_n_trim()
+		 * doesn't grow big.
 		 */
 		if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
 			spin_lock(pa->pa_obj_lock);
@@ -4273,14 +4229,16 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	 * EDQUOT check, as blocks and quotas have been already
 	 * reserved when data being copied into pagecache.
 	 */
-	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+	if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
 		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
 	else {
 		/* Without delayed allocation we need to verify
 		 * there is enough free blocks to do block allocation
 		 * and verify allocation doesn't exceed the quota limits.
 		 */
-		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+		while (ar->len &&
+			ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
+
 			/* let others to free the space */
 			yield();
 			ar->len = ar->len >> 1;
@@ -4290,9 +4248,15 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		return 0;
 	}
 	reserv_blks = ar->len;
-	while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
-		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
-		ar->len--;
+	if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
+		dquot_alloc_block_nofail(ar->inode, ar->len);
+	} else {
+		while (ar->len &&
+				dquot_alloc_block(ar->inode, ar->len)) {
+
+			ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+			ar->len--;
+		}
 	}
 	inquota = ar->len;
 	if (ar->len == 0) {
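
The allocation path keeps its back-off strategy: if the filesystem cannot commit ar->len blocks, the caller yields and retries with half the request until it succeeds or reaches zero. A condensed userspace model of that loop (a sketch only; the claim function is stubbed):

	#include <stdio.h>

	/* stub: pretend only 'avail' blocks can be claimed at once */
	static int claim_free_blocks(unsigned avail, unsigned want)
	{
		return want > avail;	/* non-zero means the claim failed, as in ext4 */
	}

	int main(void)
	{
		unsigned len = 1024, avail = 100;

		while (len && claim_free_blocks(avail, len)) {
			/* yield() in the kernel: let others free space first */
			len >>= 1;
		}
		printf("request trimmed to %u blocks\n", len);	/* prints 64 */
		return 0;
	}
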
@@ -4370,7 +4334,8 @@ out:
 	if (inquota && ar->len < inquota)
 		dquot_free_block(ar->inode, inquota - ar->len);
 	if (!ar->len) {
-		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+		if (!ext4_test_inode_state(ar->inode,
+					   EXT4_STATE_DELALLOC_RESERVED))
 			/* release all the reserved blocks if non delalloc */
 			percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 						reserv_blks);
@@ -4483,7 +4448,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
  * @inode:		inode
  * @block:		start physical block to free
  * @count:		number of blocks to count
- * @metadata:		Are these metadata blocks
+ * @flags:		flags used by ext4_free_blocks
  */
 void ext4_free_blocks(handle_t *handle, struct inode *inode,
 		      struct buffer_head *bh, ext4_fsblk_t block,
@@ -4491,7 +4456,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head *bitmap_bh = NULL;
 	struct super_block *sb = inode->i_sb;
-	struct ext4_allocation_context *ac = NULL;
 	struct ext4_group_desc *gdp;
 	unsigned long freed = 0;
 	unsigned int overflow;
@@ -4531,6 +4495,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
+			if (unlikely(!tbh))
+				continue;
 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
@@ -4546,12 +4512,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	if (!ext4_should_writeback_data(inode))
 		flags |= EXT4_FREE_BLOCKS_METADATA;
 
-	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
-	if (ac) {
-		ac->ac_inode = inode;
-		ac->ac_sb = sb;
-	}
-
 do_more:
 	overflow = 0;
 	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4609,12 +4569,7 @@ do_more:
 		BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
 	}
 #endif
-	if (ac) {
-		ac->ac_b_ex.fe_group = block_group;
-		ac->ac_b_ex.fe_start = bit;
-		ac->ac_b_ex.fe_len = count;
-		trace_ext4_mballoc_free(ac);
-	}
+	trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
 
 	err = ext4_mb_load_buddy(sb, block_group, &e4b);
 	if (err)
@@ -4626,7 +4581,11 @@ do_more:
 		 * blocks being freed are metadata. these blocks shouldn't
 		 * be used until this transaction is committed
 		 */
 		new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+		if (!new_entry) {
+			err = -ENOMEM;
+			goto error_return;
+		}
 		new_entry->start_blk = bit;
 		new_entry->group  = block_group;
 		new_entry->count = count;
@@ -4643,9 +4602,6 @@ do_more:
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
-		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-		if (test_opt(sb, DISCARD))
-			ext4_issue_discard(sb, block_group, bit, count);
 	}
 
 	ret = ext4_free_blks_count(sb, gdp) + count;
@@ -4685,7 +4641,316 @@ error_return:
 		dquot_free_block(inode, freed);
 	brelse(bitmap_bh);
 	ext4_std_error(sb, err);
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/**
+ * ext4_add_groupblocks() -- Add given blocks to an existing group
+ * @handle:		handle to this transaction
+ * @sb:			super block
+ * @block:		start physical block to add to the block group
+ * @count:		number of blocks to free
+ *
+ * This marks the blocks as free in the bitmap and buddy.
+ */
+void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+			 ext4_fsblk_t block, unsigned long count)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gd_bh;
+	ext4_group_t block_group;
+	ext4_grpblk_t bit;
+	unsigned int i;
+	struct ext4_group_desc *desc;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_buddy e4b;
+	int err = 0, ret, blk_free_count;
+	ext4_grpblk_t blocks_freed;
+	struct ext4_group_info *grp;
+
+	ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
+
+	ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
+	grp = ext4_get_group_info(sb, block_group);
+	/*
+	 * Check to see if we are freeing blocks across a group
+	 * boundary.
+	 */
+	if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
+		goto error_return;
+
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+	if (!bitmap_bh)
+		goto error_return;
+	desc = ext4_get_group_desc(sb, block_group, &gd_bh);
+	if (!desc)
+		goto error_return;
+
+	if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
+	    in_range(ext4_inode_bitmap(sb, desc), block, count) ||
+	    in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
+	    in_range(block + count - 1, ext4_inode_table(sb, desc),
+		     sbi->s_itb_per_group)) {
+		ext4_error(sb, "Adding blocks in system zones - "
+			   "Block = %llu, count = %lu",
+			   block, count);
+		goto error_return;
+	}
+
+	BUFFER_TRACE(bitmap_bh, "getting write access");
+	err = ext4_journal_get_write_access(handle, bitmap_bh);
+	if (err)
+		goto error_return;
+
+	/*
+	 * We are about to modify some metadata.  Call the journal APIs
+	 * to unshare ->b_data if a currently-committing transaction is
+	 * using it
+	 */
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, gd_bh);
+	if (err)
+		goto error_return;
+
+	for (i = 0, blocks_freed = 0; i < count; i++) {
+		BUFFER_TRACE(bitmap_bh, "clear bit");
+		if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
+			ext4_error(sb, "bit already cleared for block %llu",
+				   (ext4_fsblk_t)(block + i));
+			BUFFER_TRACE(bitmap_bh, "bit already cleared");
+		} else {
+			blocks_freed++;
+		}
+	}
+
+	err = ext4_mb_load_buddy(sb, block_group, &e4b);
+	if (err)
+		goto error_return;
+
+	/*
+	 * need to update group_info->bb_free and bitmap
+	 * with group lock held. ext4_mb_generate_buddy()
+	 * looks at them with the group lock held.
+	 */
+	ext4_lock_group(sb, block_group);
+	mb_clear_bits(bitmap_bh->b_data, bit, count);
+	mb_free_blocks(NULL, &e4b, bit, count);
+	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
+	ext4_free_blks_set(sb, desc, blk_free_count);
+	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+	ext4_unlock_group(sb, block_group);
+	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		atomic_add(blocks_freed,
+			   &sbi->s_flex_groups[flex_group].free_blocks);
+	}
+
+	ext4_mb_unload_buddy(&e4b);
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+	ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
+	if (!err)
+		err = ret;
+
+error_return:
+	brelse(bitmap_bh);
+	ext4_std_error(sb, err);
+	return;
+}
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:		super block for the file system
+ * @start:	starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark them as used in the buddy bitmap.
+ * This must be called under the group lock.
+ */
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+			     ext4_group_t group, struct ext4_buddy *e4b)
+{
+	struct ext4_free_extent ex;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/*
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+	ext4_issue_discard(sb, group, start, count);
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @group:		group to be trimmed
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through the group's block bitmap searching for
+ * free extents. When a free extent of at least @minblocks is found, it is
+ * marked as used in the group buddy bitmap, a TRIM command is issued on the
+ * extent via ext4_trim_extent, and the extent is freed again in the group
+ * buddy bitmap. This is done until the whole group is scanned.
+ */
+static ext4_grpblk_t
+ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+		   ext4_grpblk_t start, ext4_grpblk_t max,
+		   ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0;
+	struct ext4_buddy e4b;
+	int ret;
+
+	ret = ext4_mb_load_buddy(sb, group, &e4b);
+	if (ret) {
+		ext4_error(sb, "Error in loading buddy "
+			   "information for %u", group);
+		return ret;
+	}
+	bitmap = e4b.bd_bitmap;
+
+	ext4_lock_group(sb, group);
+	start = (e4b.bd_info->bb_first_free > start) ?
+		e4b.bd_info->bb_first_free : start;
+
+	while (start < max) {
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+			ext4_trim_extent(sb, start,
+					 next - start, group, &e4b);
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (fatal_signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		if ((e4b.bd_info->bb_free - count) < minblocks)
+			break;
+	}
+	ext4_unlock_group(sb, group);
+	ext4_mb_unload_buddy(&e4b);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		   count, group);
+
+	return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @range:		fstrim_range structure
+ *
+ * start:	First Byte to trim
+ * len:		number of Bytes to trim from start
+ * minlen:	minimum extent length in Bytes
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such group, ext4_trim_all_free is invoked
+ * to trim all free space.
+ */
+int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
+{
+	struct ext4_group_info *grp;
+	ext4_group_t first_group, last_group;
+	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t cnt = 0, first_block, last_block;
+	uint64_t start, len, minlen, trimmed = 0;
+	ext4_fsblk_t first_data_blk =
+			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	int ret = 0;
+
+	start = range->start >> sb->s_blocksize_bits;
+	len = range->len >> sb->s_blocksize_bits;
+	minlen = range->minlen >> sb->s_blocksize_bits;
+
+	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+	if (start < first_data_blk) {
+		len -= first_data_blk - start;
+		start = first_data_blk;
+	}
+
+	/* Determine first and last group to examine based on start and len */
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
+				     &first_group, &first_block);
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+				     &last_group, &last_block);
+	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+	last_block = EXT4_BLOCKS_PER_GROUP(sb);
+
+	if (first_group > last_group)
+		return -EINVAL;
+
+	for (group = first_group; group <= last_group; group++) {
+		grp = ext4_get_group_info(sb, group);
+		/* We only do this if the grp has never been initialized */
+		if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+			ret = ext4_mb_init_group(sb, group);
+			if (ret)
+				break;
+		}
+
+		/*
+		 * For all the groups except the last one, last block will
+		 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
+		 * change it for the last group in which case start +
+		 * len < EXT4_BLOCKS_PER_GROUP(sb).
+		 */
+		if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
+			last_block = first_block + len;
+		len -= last_block - first_block;
+
+		if (grp->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, group, first_block,
+						 last_block, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				break;
+			}
+		}
+		trimmed += cnt;
+		first_block = 0;
+	}
+	range->len = trimmed * sb->s_blocksize;
+
+	return ret;
+}
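
ext4_trim_fs() is the filesystem backend for the FITRIM ioctl: userspace passes an fstrim_range of bytes, the kernel converts it to block groups, and on return range->len reports how many bytes were actually trimmed. A minimal userspace caller (a sketch, assuming a kernel with FITRIM support; the mount point path is an example):

	#include <fcntl.h>
	#include <linux/fs.h>	/* FITRIM, struct fstrim_range */
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>

	int main(int argc, char **argv)
	{
		struct fstrim_range range;
		int fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		memset(&range, 0, sizeof(range));
		range.start = 0;
		range.len = (__u64)-1;	/* whole filesystem */
		range.minlen = 4096;	/* skip extents shorter than 4 KiB */

		if (ioctl(fd, FITRIM, &range) < 0) {
			perror("FITRIM");
			return 1;
		}
		/* the kernel stores the number of bytes trimmed back in range.len */
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);
		return 0;
	}
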