aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/mballoc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r--fs/ext4/mballoc.c429
1 files changed, 224 insertions, 205 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h>
25#include <trace/events/ext4.h> 26#include <trace/events/ext4.h>
26 27
27/* 28/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622 623
623/* FIXME!! need more doc */ 624/* FIXME!! need more doc */
624static void ext4_mb_mark_free_simple(struct super_block *sb, 625static void ext4_mb_mark_free_simple(struct super_block *sb,
625 void *buddy, unsigned first, int len, 626 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
626 struct ext4_group_info *grp) 627 struct ext4_group_info *grp)
627{ 628{
628 struct ext4_sb_info *sbi = EXT4_SB(sb); 629 struct ext4_sb_info *sbi = EXT4_SB(sb);
629 unsigned short min; 630 ext4_grpblk_t min;
630 unsigned short max; 631 ext4_grpblk_t max;
631 unsigned short chunk; 632 ext4_grpblk_t chunk;
632 unsigned short border; 633 unsigned short border;
633 634
634 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 635 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 663 void *buddy, void *bitmap, ext4_group_t group)
663{ 664{
664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 665 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
665 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 666 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
666 unsigned short i = 0; 667 ext4_grpblk_t i = 0;
667 unsigned short first; 668 ext4_grpblk_t first;
668 unsigned short len; 669 ext4_grpblk_t len;
669 unsigned free = 0; 670 unsigned free = 0;
670 unsigned fragments = 0; 671 unsigned fragments = 0;
671 unsigned long long period = get_cycles(); 672 unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
743 char *data; 744 char *data;
744 char *bitmap; 745 char *bitmap;
745 746
746 mb_debug("init page %lu\n", page->index); 747 mb_debug(1, "init page %lu\n", page->index);
747 748
748 inode = page->mapping->host; 749 inode = page->mapping->host;
749 sb = inode->i_sb; 750 sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
822 set_bitmap_uptodate(bh[i]); 823 set_bitmap_uptodate(bh[i]);
823 bh[i]->b_end_io = end_buffer_read_sync; 824 bh[i]->b_end_io = end_buffer_read_sync;
824 submit_bh(READ, bh[i]); 825 submit_bh(READ, bh[i]);
825 mb_debug("read bitmap for group %u\n", first_group + i); 826 mb_debug(1, "read bitmap for group %u\n", first_group + i);
826 } 827 }
827 828
828 /* wait for I/O completion */ 829 /* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 if ((first_block + i) & 1) { 863 if ((first_block + i) & 1) {
863 /* this is block of buddy */ 864 /* this is block of buddy */
864 BUG_ON(incore == NULL); 865 BUG_ON(incore == NULL);
865 mb_debug("put buddy for group %u in page %lu/%x\n", 866 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 867 group, page->index, i * blocksize);
867 grinfo = ext4_get_group_info(sb, group); 868 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 869 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 870 memset(grinfo->bb_counters, 0,
870 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 871 sizeof(*grinfo->bb_counters) *
872 (sb->s_blocksize_bits+2));
871 /* 873 /*
872 * incore got set to the group block bitmap below 874 * incore got set to the group block bitmap below
873 */ 875 */
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
878 } else { 880 } else {
879 /* this is block of bitmap */ 881 /* this is block of bitmap */
880 BUG_ON(incore != NULL); 882 BUG_ON(incore != NULL);
881 mb_debug("put bitmap for group %u in page %lu/%x\n", 883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
882 group, page->index, i * blocksize); 884 group, page->index, i * blocksize);
883 885
884 /* see comments in ext4_mb_put_pa() */ 886 /* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
908 return err; 910 return err;
909} 911}
910 912
913static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{
916
917 int ret = 0;
918 void *bitmap;
919 int blocks_per_page;
920 int block, pnum, poff;
921 int num_grp_locked = 0;
922 struct ext4_group_info *this_grp;
923 struct ext4_sb_info *sbi = EXT4_SB(sb);
924 struct inode *inode = sbi->s_buddy_cache;
925 struct page *page = NULL, *bitmap_page = NULL;
926
927 mb_debug(1, "init group %u\n", group);
928 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
929 this_grp = ext4_get_group_info(sb, group);
930 /*
931 * This ensures that we don't reinit the buddy cache
932 * page which map to the group from which we are already
933 * allocating. If we are looking at the buddy cache we would
934 * have taken a reference using ext4_mb_load_buddy and that
935 * would have taken the alloc_sem lock.
936 */
937 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
938 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
939 /*
940 * somebody initialized the group
941 * return without doing anything
942 */
943 ret = 0;
944 goto err;
945 }
946 /*
947 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks.
949 * So for each group we need two blocks.
950 */
951 block = group * 2;
952 pnum = block / blocks_per_page;
953 poff = block % blocks_per_page;
954 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
955 if (page) {
956 BUG_ON(page->mapping != inode->i_mapping);
957 ret = ext4_mb_init_cache(page, NULL);
958 if (ret) {
959 unlock_page(page);
960 goto err;
961 }
962 unlock_page(page);
963 }
964 if (page == NULL || !PageUptodate(page)) {
965 ret = -EIO;
966 goto err;
967 }
968 mark_page_accessed(page);
969 bitmap_page = page;
970 bitmap = page_address(page) + (poff * sb->s_blocksize);
971
972 /* init buddy cache */
973 block++;
974 pnum = block / blocks_per_page;
975 poff = block % blocks_per_page;
976 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
977 if (page == bitmap_page) {
978 /*
979 * If both the bitmap and buddy are in
980 * the same page we don't need to force
981 * init the buddy
982 */
983 unlock_page(page);
984 } else if (page) {
985 BUG_ON(page->mapping != inode->i_mapping);
986 ret = ext4_mb_init_cache(page, bitmap);
987 if (ret) {
988 unlock_page(page);
989 goto err;
990 }
991 unlock_page(page);
992 }
993 if (page == NULL || !PageUptodate(page)) {
994 ret = -EIO;
995 goto err;
996 }
997 mark_page_accessed(page);
998err:
999 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1000 if (bitmap_page)
1001 page_cache_release(bitmap_page);
1002 if (page)
1003 page_cache_release(page);
1004 return ret;
1005}
1006
911static noinline_for_stack int 1007static noinline_for_stack int
912ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
913 struct ext4_buddy *e4b) 1009 struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
922 struct ext4_sb_info *sbi = EXT4_SB(sb); 1018 struct ext4_sb_info *sbi = EXT4_SB(sb);
923 struct inode *inode = sbi->s_buddy_cache; 1019 struct inode *inode = sbi->s_buddy_cache;
924 1020
925 mb_debug("load group %u\n", group); 1021 mb_debug(1, "load group %u\n", group);
926 1022
927 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1023 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
928 grp = ext4_get_group_info(sb, group); 1024 grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
941 * groups mapped by the page is blocked 1037 * groups mapped by the page is blocked
942 * till we are done with allocation 1038 * till we are done with allocation
943 */ 1039 */
1040repeat_load_buddy:
944 down_read(e4b->alloc_semp); 1041 down_read(e4b->alloc_semp);
945 1042
1043 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1044 /* we need to check for group need init flag
1045 * with alloc_semp held so that we can be sure
1046 * that new blocks didn't get added to the group
1047 * when we are loading the buddy cache
1048 */
1049 up_read(e4b->alloc_semp);
1050 /*
1051 * we need full data about the group
1052 * to make a good selection
1053 */
1054 ret = ext4_mb_init_group(sb, group);
1055 if (ret)
1056 return ret;
1057 goto repeat_load_buddy;
1058 }
1059
946 /* 1060 /*
947 * the buddy cache inode stores the block bitmap 1061 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks. 1062 * and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1360 ac->alloc_semp = e4b->alloc_semp; 1474 ac->alloc_semp = e4b->alloc_semp;
1361 e4b->alloc_semp = NULL; 1475 e4b->alloc_semp = NULL;
1362 /* store last allocated for subsequent stream allocation */ 1476 /* store last allocated for subsequent stream allocation */
1363 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1477 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1364 spin_lock(&sbi->s_md_lock); 1478 spin_lock(&sbi->s_md_lock);
1365 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1479 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1366 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1480 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1837 1951
1838} 1952}
1839 1953
1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1842{
1843
1844 int ret;
1845 void *bitmap;
1846 int blocks_per_page;
1847 int block, pnum, poff;
1848 int num_grp_locked = 0;
1849 struct ext4_group_info *this_grp;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 struct inode *inode = sbi->s_buddy_cache;
1852 struct page *page = NULL, *bitmap_page = NULL;
1853
1854 mb_debug("init group %lu\n", group);
1855 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1856 this_grp = ext4_get_group_info(sb, group);
1857 /*
1858 * This ensures we don't add group
1859 * to this buddy cache via resize
1860 */
1861 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1862 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1863 /*
1864 * somebody initialized the group
1865 * return without doing anything
1866 */
1867 ret = 0;
1868 goto err;
1869 }
1870 /*
1871 * the buddy cache inode stores the block bitmap
1872 * and buddy information in consecutive blocks.
1873 * So for each group we need two blocks.
1874 */
1875 block = group * 2;
1876 pnum = block / blocks_per_page;
1877 poff = block % blocks_per_page;
1878 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1879 if (page) {
1880 BUG_ON(page->mapping != inode->i_mapping);
1881 ret = ext4_mb_init_cache(page, NULL);
1882 if (ret) {
1883 unlock_page(page);
1884 goto err;
1885 }
1886 unlock_page(page);
1887 }
1888 if (page == NULL || !PageUptodate(page)) {
1889 ret = -EIO;
1890 goto err;
1891 }
1892 mark_page_accessed(page);
1893 bitmap_page = page;
1894 bitmap = page_address(page) + (poff * sb->s_blocksize);
1895
1896 /* init buddy cache */
1897 block++;
1898 pnum = block / blocks_per_page;
1899 poff = block % blocks_per_page;
1900 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1901 if (page == bitmap_page) {
1902 /*
1903 * If both the bitmap and buddy are in
1904 * the same page we don't need to force
1905 * init the buddy
1906 */
1907 unlock_page(page);
1908 } else if (page) {
1909 BUG_ON(page->mapping != inode->i_mapping);
1910 ret = ext4_mb_init_cache(page, bitmap);
1911 if (ret) {
1912 unlock_page(page);
1913 goto err;
1914 }
1915 unlock_page(page);
1916 }
1917 if (page == NULL || !PageUptodate(page)) {
1918 ret = -EIO;
1919 goto err;
1920 }
1921 mark_page_accessed(page);
1922err:
1923 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1924 if (bitmap_page)
1925 page_cache_release(bitmap_page);
1926 if (page)
1927 page_cache_release(page);
1928 return ret;
1929}
1930
1931static noinline_for_stack int 1954static noinline_for_stack int
1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1955ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1933{ 1956{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1938 struct ext4_sb_info *sbi; 1961 struct ext4_sb_info *sbi;
1939 struct super_block *sb; 1962 struct super_block *sb;
1940 struct ext4_buddy e4b; 1963 struct ext4_buddy e4b;
1941 loff_t size, isize;
1942 1964
1943 sb = ac->ac_sb; 1965 sb = ac->ac_sb;
1944 sbi = EXT4_SB(sb); 1966 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb); 1967 ngroups = ext4_get_groups_count(sb);
1968 /* non-extent files are limited to low blocks/groups */
1969 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
1970 ngroups = sbi->s_blockfile_groups;
1971
1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1972 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1947 1973
1948 /* first, try the goal */ 1974 /* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1974 } 2000 }
1975 2001
1976 bsbits = ac->ac_sb->s_blocksize_bits; 2002 bsbits = ac->ac_sb->s_blocksize_bits;
1977 /* if stream allocation is enabled, use global goal */
1978 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1979 isize = i_size_read(ac->ac_inode) >> bsbits;
1980 if (size < isize)
1981 size = isize;
1982 2003
1983 if (size < sbi->s_mb_stream_request && 2004 /* if stream allocation is enabled, use global goal */
1984 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 2005 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1985 /* TBD: may be hot point */ 2006 /* TBD: may be hot point */
1986 spin_lock(&sbi->s_md_lock); 2007 spin_lock(&sbi->s_md_lock);
1987 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2008 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1988 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2009 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1989 spin_unlock(&sbi->s_md_lock); 2010 spin_unlock(&sbi->s_md_lock);
1990 } 2011 }
2012
1991 /* Let's just scan groups to find more-less suitable blocks */ 2013 /* Let's just scan groups to find more-less suitable blocks */
1992 cr = ac->ac_2order ? 0 : 1; 2014 cr = ac->ac_2order ? 0 : 1;
1993 /* 2015 /*
@@ -2015,27 +2037,6 @@ repeat:
2015 if (grp->bb_free == 0) 2037 if (grp->bb_free == 0)
2016 continue; 2038 continue;
2017 2039
2018 /*
2019 * if the group is already init we check whether it is
2020 * a good group and if not we don't load the buddy
2021 */
2022 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2023 /*
2024 * we need full data about the group
2025 * to make a good selection
2026 */
2027 err = ext4_mb_init_group(sb, group);
2028 if (err)
2029 goto out;
2030 }
2031
2032 /*
2033 * If the particular group doesn't satisfy our
2034 * criteria we continue with the next group
2035 */
2036 if (!ext4_mb_good_group(ac, group, cr))
2037 continue;
2038
2039 err = ext4_mb_load_buddy(sb, group, &e4b); 2040 err = ext4_mb_load_buddy(sb, group, &e4b);
2040 if (err) 2041 if (err)
2041 goto out; 2042 goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2156 2157
2157 if (v == SEQ_START_TOKEN) { 2158 if (v == SEQ_START_TOKEN) {
2158 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2159 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2160 "pid", "inode", "original", "goal", "result", "found", 2161 "pid", "inode", "original", "goal", "result", "found",
2161 "grps", "cr", "flags", "merge", "tail", "broken"); 2162 "grps", "cr", "flags", "merge", "tail", "broken");
2162 return 0; 2163 return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2164 2165
2165 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2166 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2167 "%-5u %-5s %-5u %-6u\n"; 2168 "0x%04x %-5s %-5u %-6u\n";
2168 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2169 hs->result.fe_start, hs->result.fe_len, 2170 hs->result.fe_start, hs->result.fe_len,
2170 hs->result.fe_logical); 2171 hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2205{ 2206{
2206} 2207}
2207 2208
2208static struct seq_operations ext4_mb_seq_history_ops = { 2209static const struct seq_operations ext4_mb_seq_history_ops = {
2209 .start = ext4_mb_seq_history_start, 2210 .start = ext4_mb_seq_history_start,
2210 .next = ext4_mb_seq_history_next, 2211 .next = ext4_mb_seq_history_next,
2211 .stop = ext4_mb_seq_history_stop, 2212 .stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
2287 return count; 2288 return count;
2288} 2289}
2289 2290
2290static struct file_operations ext4_mb_seq_history_fops = { 2291static const struct file_operations ext4_mb_seq_history_fops = {
2291 .owner = THIS_MODULE, 2292 .owner = THIS_MODULE,
2292 .open = ext4_mb_seq_history_open, 2293 .open = ext4_mb_seq_history_open,
2293 .read = seq_read, 2294 .read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2328 struct ext4_buddy e4b; 2329 struct ext4_buddy e4b;
2329 struct sg { 2330 struct sg {
2330 struct ext4_group_info info; 2331 struct ext4_group_info info;
2331 unsigned short counters[16]; 2332 ext4_grpblk_t counters[16];
2332 } sg; 2333 } sg;
2333 2334
2334 group--; 2335 group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2366{ 2367{
2367} 2368}
2368 2369
2369static struct seq_operations ext4_mb_seq_groups_ops = { 2370static const struct seq_operations ext4_mb_seq_groups_ops = {
2370 .start = ext4_mb_seq_groups_start, 2371 .start = ext4_mb_seq_groups_start,
2371 .next = ext4_mb_seq_groups_next, 2372 .next = ext4_mb_seq_groups_next,
2372 .stop = ext4_mb_seq_groups_stop, 2373 .stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2387 2388
2388} 2389}
2389 2390
2390static struct file_operations ext4_mb_seq_groups_fops = { 2391static const struct file_operations ext4_mb_seq_groups_fops = {
2391 .owner = THIS_MODULE, 2392 .owner = THIS_MODULE,
2392 .open = ext4_mb_seq_groups_open, 2393 .open = ext4_mb_seq_groups_open,
2393 .read = seq_read, 2394 .read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2532 2533
2533 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2534 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2534 init_rwsem(&meta_group_info[i]->alloc_sem); 2535 init_rwsem(&meta_group_info[i]->alloc_sem);
2535 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2536 meta_group_info[i]->bb_free_root.rb_node = NULL;
2536 2537
2537#ifdef DOUBLE_CHECK 2538#ifdef DOUBLE_CHECK
2538 { 2539 {
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
2558 return -ENOMEM; 2559 return -ENOMEM;
2559} /* ext4_mb_add_groupinfo */ 2560} /* ext4_mb_add_groupinfo */
2560 2561
2561/*
2562 * Update an existing group.
2563 * This function is used for online resize
2564 */
2565void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2566{
2567 grp->bb_free += add;
2568}
2569
2570static int ext4_mb_init_backend(struct super_block *sb) 2562static int ext4_mb_init_backend(struct super_block *sb)
2571{ 2563{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb); 2564 ext4_group_t ngroups = ext4_get_groups_count(sb);
2573 ext4_group_t i; 2565 ext4_group_t i;
2574 int metalen;
2575 struct ext4_sb_info *sbi = EXT4_SB(sb); 2566 struct ext4_sb_info *sbi = EXT4_SB(sb);
2576 struct ext4_super_block *es = sbi->s_es; 2567 struct ext4_super_block *es = sbi->s_es;
2577 int num_meta_group_infos; 2568 int num_meta_group_infos;
2578 int num_meta_group_infos_max; 2569 int num_meta_group_infos_max;
2579 int array_size; 2570 int array_size;
2580 struct ext4_group_info **meta_group_info;
2581 struct ext4_group_desc *desc; 2571 struct ext4_group_desc *desc;
2582 2572
2583 /* This is the number of blocks used by GDT */ 2573 /* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2622 goto err_freesgi; 2612 goto err_freesgi;
2623 } 2613 }
2624 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2614 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2625
2626 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2627 for (i = 0; i < num_meta_group_infos; i++) {
2628 if ((i + 1) == num_meta_group_infos)
2629 metalen = sizeof(*meta_group_info) *
2630 (ngroups -
2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2633 if (meta_group_info == NULL) {
2634 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2635 "buddy group\n");
2636 goto err_freemeta;
2637 }
2638 sbi->s_group_info[i] = meta_group_info;
2639 }
2640
2641 for (i = 0; i < ngroups; i++) { 2615 for (i = 0; i < ngroups; i++) {
2642 desc = ext4_get_group_desc(sb, i, NULL); 2616 desc = ext4_get_group_desc(sb, i, NULL);
2643 if (desc == NULL) { 2617 if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
2655 while (i-- > 0) 2629 while (i-- > 0)
2656 kfree(ext4_get_group_info(sb, i)); 2630 kfree(ext4_get_group_info(sb, i));
2657 i = num_meta_group_infos; 2631 i = num_meta_group_infos;
2658err_freemeta:
2659 while (i-- > 0) 2632 while (i-- > 0)
2660 kfree(sbi->s_group_info[i]); 2633 kfree(sbi->s_group_info[i]);
2661 iput(sbi->s_buddy_cache); 2634 iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2672 unsigned max; 2645 unsigned max;
2673 int ret; 2646 int ret;
2674 2647
2675 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2648 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2676 2649
2677 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2650 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2678 if (sbi->s_mb_offsets == NULL) { 2651 if (sbi->s_mb_offsets == NULL) {
2679 return -ENOMEM; 2652 return -ENOMEM;
2680 } 2653 }
2681 2654
2682 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2655 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2683 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2656 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2684 if (sbi->s_mb_maxs == NULL) { 2657 if (sbi->s_mb_maxs == NULL) {
2685 kfree(sbi->s_mb_offsets); 2658 kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2758 kmem_cache_free(ext4_pspace_cachep, pa); 2731 kmem_cache_free(ext4_pspace_cachep, pa);
2759 } 2732 }
2760 if (count) 2733 if (count)
2761 mb_debug("mballoc: %u PAs left\n", count); 2734 mb_debug(1, "mballoc: %u PAs left\n", count);
2762 2735
2763} 2736}
2764 2737
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2839 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2812 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2840 entry = list_entry(l, struct ext4_free_data, list); 2813 entry = list_entry(l, struct ext4_free_data, list);
2841 2814
2842 mb_debug("gonna free %u blocks in group %u (0x%p):", 2815 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2843 entry->count, entry->group, entry); 2816 entry->count, entry->group, entry);
2844 2817
2845 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2818 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2874 ext4_mb_release_desc(&e4b); 2847 ext4_mb_release_desc(&e4b);
2875 } 2848 }
2876 2849
2877 mb_debug("freed %u blocks in %u structures\n", count, count2); 2850 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2851}
2852
2853#ifdef CONFIG_EXT4_DEBUG
2854u8 mb_enable_debug __read_mostly;
2855
2856static struct dentry *debugfs_dir;
2857static struct dentry *debugfs_debug;
2858
2859static void __init ext4_create_debugfs_entry(void)
2860{
2861 debugfs_dir = debugfs_create_dir("ext4", NULL);
2862 if (debugfs_dir)
2863 debugfs_debug = debugfs_create_u8("mballoc-debug",
2864 S_IRUGO | S_IWUSR,
2865 debugfs_dir,
2866 &mb_enable_debug);
2867}
2868
2869static void ext4_remove_debugfs_entry(void)
2870{
2871 debugfs_remove(debugfs_debug);
2872 debugfs_remove(debugfs_dir);
2878} 2873}
2879 2874
2875#else
2876
2877static void __init ext4_create_debugfs_entry(void)
2878{
2879}
2880
2881static void ext4_remove_debugfs_entry(void)
2882{
2883}
2884
2885#endif
2886
2880int __init init_ext4_mballoc(void) 2887int __init init_ext4_mballoc(void)
2881{ 2888{
2882 ext4_pspace_cachep = 2889 ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
2904 kmem_cache_destroy(ext4_ac_cachep); 2911 kmem_cache_destroy(ext4_ac_cachep);
2905 return -ENOMEM; 2912 return -ENOMEM;
2906 } 2913 }
2914 ext4_create_debugfs_entry();
2907 return 0; 2915 return 0;
2908} 2916}
2909 2917
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
2917 kmem_cache_destroy(ext4_pspace_cachep); 2925 kmem_cache_destroy(ext4_pspace_cachep);
2918 kmem_cache_destroy(ext4_ac_cachep); 2926 kmem_cache_destroy(ext4_ac_cachep);
2919 kmem_cache_destroy(ext4_free_ext_cachep); 2927 kmem_cache_destroy(ext4_free_ext_cachep);
2928 ext4_remove_debugfs_entry();
2920} 2929}
2921 2930
2922 2931
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3061 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 3070 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3062 else 3071 else
3063 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3072 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3064 mb_debug("#%u: goal %u blocks for locality group\n", 3073 mb_debug(1, "#%u: goal %u blocks for locality group\n",
3065 current->pid, ac->ac_g_ex.fe_len); 3074 current->pid, ac->ac_g_ex.fe_len);
3066} 3075}
3067 3076
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3180 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3189 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3181 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3190 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3182 3191
3183 /* skip PA normalized request doesn't overlap with */ 3192 /* skip PAs this normalized request doesn't overlap with */
3184 if (pa->pa_lstart >= end) { 3193 if (pa->pa_lstart >= end || pa_end <= start) {
3185 spin_unlock(&pa->pa_lock);
3186 continue;
3187 }
3188 if (pa_end <= start) {
3189 spin_unlock(&pa->pa_lock); 3194 spin_unlock(&pa->pa_lock);
3190 continue; 3195 continue;
3191 } 3196 }
3192 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3197 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3193 3198
3199 /* adjust start or end to be adjacent to this pa */
3194 if (pa_end <= ac->ac_o_ex.fe_logical) { 3200 if (pa_end <= ac->ac_o_ex.fe_logical) {
3195 BUG_ON(pa_end < start); 3201 BUG_ON(pa_end < start);
3196 start = pa_end; 3202 start = pa_end;
3197 } 3203 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3198
3199 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3200 BUG_ON(pa->pa_lstart > end); 3204 BUG_ON(pa->pa_lstart > end);
3201 end = pa->pa_lstart; 3205 end = pa->pa_lstart;
3202 } 3206 }
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3251 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3255 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3252 } 3256 }
3253 3257
3254 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3258 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3255 (unsigned) orig_size, (unsigned) start); 3259 (unsigned) orig_size, (unsigned) start);
3256} 3260}
3257 3261
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3300 BUG_ON(pa->pa_free < len); 3304 BUG_ON(pa->pa_free < len);
3301 pa->pa_free -= len; 3305 pa->pa_free -= len;
3302 3306
3303 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3307 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3304} 3308}
3305 3309
3306/* 3310/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3324 * in on-disk bitmap -- see ext4_mb_release_context() 3328 * in on-disk bitmap -- see ext4_mb_release_context()
3325 * Other CPUs are prevented from allocating from this pa by lg_mutex 3329 * Other CPUs are prevented from allocating from this pa by lg_mutex
3326 */ 3330 */
3327 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3331 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3328} 3332}
3329 3333
3330/* 3334/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3382 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3386 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3383 continue; 3387 continue;
3384 3388
3389 /* non-extent files can't have physical blocks past 2^32 */
3390 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
3391 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3392 continue;
3393
3385 /* found preallocated blocks, use them */ 3394 /* found preallocated blocks, use them */
3386 spin_lock(&pa->pa_lock); 3395 spin_lock(&pa->pa_lock);
3387 if (pa->pa_deleted == 0 && pa->pa_free) { 3396 if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3503 preallocated += len; 3512 preallocated += len;
3504 count++; 3513 count++;
3505 } 3514 }
3506 mb_debug("prellocated %u for group %u\n", preallocated, group); 3515 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3507} 3516}
3508 3517
3509static void ext4_mb_pa_callback(struct rcu_head *head) 3518static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3638 pa->pa_deleted = 0; 3647 pa->pa_deleted = 0;
3639 pa->pa_type = MB_INODE_PA; 3648 pa->pa_type = MB_INODE_PA;
3640 3649
3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3650 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3651 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3643 trace_ext4_mb_new_inode_pa(ac, pa); 3652 trace_ext4_mb_new_inode_pa(ac, pa);
3644 3653
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3698 pa->pa_deleted = 0; 3707 pa->pa_deleted = 0;
3699 pa->pa_type = MB_GROUP_PA; 3708 pa->pa_type = MB_GROUP_PA;
3700 3709
3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3710 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3711 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3703 trace_ext4_mb_new_group_pa(ac, pa); 3712 trace_ext4_mb_new_group_pa(ac, pa);
3704 3713
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3777 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3786 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3778 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3787 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3779 le32_to_cpu(sbi->s_es->s_first_data_block); 3788 le32_to_cpu(sbi->s_es->s_first_data_block);
3780 mb_debug(" free preallocated %u/%u in group %u\n", 3789 mb_debug(1, " free preallocated %u/%u in group %u\n",
3781 (unsigned) start, (unsigned) next - bit, 3790 (unsigned) start, (unsigned) next - bit,
3782 (unsigned) group); 3791 (unsigned) group);
3783 free += next - bit; 3792 free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3868 int busy = 0; 3877 int busy = 0;
3869 int free = 0; 3878 int free = 0;
3870 3879
3871 mb_debug("discard preallocation for group %u\n", group); 3880 mb_debug(1, "discard preallocation for group %u\n", group);
3872 3881
3873 if (list_empty(&grp->bb_prealloc_list)) 3882 if (list_empty(&grp->bb_prealloc_list))
3874 return 0; 3883 return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
3992 return; 4001 return;
3993 } 4002 }
3994 4003
3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4004 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3996 trace_ext4_discard_preallocations(inode); 4005 trace_ext4_discard_preallocations(inode);
3997 4006
3998 INIT_LIST_HEAD(&list); 4007 INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4097{ 4106{
4098 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 4107 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4099} 4108}
4100#ifdef MB_DEBUG 4109#ifdef CONFIG_EXT4_DEBUG
4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4110static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4102{ 4111{
4103 struct super_block *sb = ac->ac_sb; 4112 struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4139 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4148 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4140 NULL, &start); 4149 NULL, &start);
4141 spin_unlock(&pa->pa_lock); 4150 spin_unlock(&pa->pa_lock);
4142 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 4151 printk(KERN_ERR "PA:%u:%d:%u \n", i,
4143 start, pa->pa_len); 4152 start, pa->pa_len);
4144 } 4153 }
4145 ext4_unlock_group(sb, i); 4154 ext4_unlock_group(sb, i);
4146 4155
4147 if (grp->bb_free == 0) 4156 if (grp->bb_free == 0)
4148 continue; 4157 continue;
4149 printk(KERN_ERR "%lu: %d/%d \n", 4158 printk(KERN_ERR "%u: %d/%d \n",
4150 i, grp->bb_free, grp->bb_fragments); 4159 i, grp->bb_free, grp->bb_fragments);
4151 } 4160 }
4152 printk(KERN_ERR "\n"); 4161 printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4174 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4183 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4175 return; 4184 return;
4176 4185
4186 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4187 return;
4188
4177 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4178 isize = i_size_read(ac->ac_inode) >> bsbits; 4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits;
4179 size = max(size, isize); 4192 size = max(size, isize);
4180 4193
4181 /* don't use group allocation for large files */ 4194 if ((size == isize) &&
4182 if (size >= sbi->s_mb_stream_request) 4195 !ext4_fs_is_busy(sbi) &&
4196 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4197 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183 return; 4198 return;
4199 }
4184 4200
4185 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4201 /* don't use group allocation for large files */
4202 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4186 return; 4204 return;
4205 }
4187 4206
4188 BUG_ON(ac->ac_lg != NULL); 4207 BUG_ON(ac->ac_lg != NULL);
4189 /* 4208 /*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 * locality group. this is a policy, actually */ 4265 * locality group. this is a policy, actually */
4247 ext4_mb_group_or_file(ac); 4266 ext4_mb_group_or_file(ac);
4248 4267
4249 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4268 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4250 "left: %u/%u, right %u/%u to %swritable\n", 4269 "left: %u/%u, right %u/%u to %swritable\n",
4251 (unsigned) ar->len, (unsigned) ar->logical, 4270 (unsigned) ar->len, (unsigned) ar->logical,
4252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4271 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268 struct ext4_prealloc_space *pa, *tmp; 4287 struct ext4_prealloc_space *pa, *tmp;
4269 struct ext4_allocation_context *ac; 4288 struct ext4_allocation_context *ac;
4270 4289
4271 mb_debug("discard locality group preallocation\n"); 4290 mb_debug(1, "discard locality group preallocation\n");
4272 4291
4273 INIT_LIST_HEAD(&discard_list); 4292 INIT_LIST_HEAD(&discard_list);
4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4293 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);