diff options
author | Curt Wohlgemuth <curtw@google.com> | 2010-05-16 15:00:00 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2010-05-16 15:00:00 -0400 |
commit | 8a57d9d61a6e361c7bb159dda797672c1df1a691 (patch) | |
tree | 39a01022ed2294f0acc94b45554c9a292db671dc /fs | |
parent | 6d19c42b7cf81c39632b6d4dbc514e8449bcd346 (diff) |
ext4: check for a good block group before loading buddy pages
This adds a new field in ext4_group_info to cache the order of the largest
free extent in a block group, and defers loading the buddy pages until *after*
we've done a sanity check on the block group.
With large allocation requests (e.g., fallocate(), 8MiB) and relatively full
partitions, it's easy to have no block groups with a block extent large
enough to satisfy the input request length. This currently causes the loop
during cr == 0 in ext4_mb_regular_allocator() to load the buddy bitmap pages
for EVERY block group. That can be a lot of pages. The patch below allows
us to call ext4_mb_good_group() BEFORE we load the buddy pages (although we
have to check again after we lock the block group).
Addresses-Google-Bug: #2578108
Addresses-Google-Bug: #2704453
Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext4/ext4.h | 1 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 70 |
2 files changed, 58 insertions, 13 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index bf938cf7c5f0..d266003cac3e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -1678,6 +1678,7 @@ struct ext4_group_info { | |||
1678 | ext4_grpblk_t bb_first_free; /* first free block */ | 1678 | ext4_grpblk_t bb_first_free; /* first free block */ |
1679 | ext4_grpblk_t bb_free; /* total free blocks */ | 1679 | ext4_grpblk_t bb_free; /* total free blocks */ |
1680 | ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ | 1680 | ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ |
1681 | ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ | ||
1681 | struct list_head bb_prealloc_list; | 1682 | struct list_head bb_prealloc_list; |
1682 | #ifdef DOUBLE_CHECK | 1683 | #ifdef DOUBLE_CHECK |
1683 | void *bb_bitmap; | 1684 | void *bb_bitmap; |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4f2d3a9d4e21..aa499fe11687 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, | |||
658 | } | 658 | } |
659 | } | 659 | } |
660 | 660 | ||
661 | /* | ||
662 | * Cache the order of the largest free extent we have available in this block | ||
663 | * group. | ||
664 | */ | ||
665 | static void | ||
666 | mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) | ||
667 | { | ||
668 | int i; | ||
669 | int bits; | ||
670 | |||
671 | grp->bb_largest_free_order = -1; /* uninit */ | ||
672 | |||
673 | bits = sb->s_blocksize_bits + 1; | ||
674 | for (i = bits; i >= 0; i--) { | ||
675 | if (grp->bb_counters[i] > 0) { | ||
676 | grp->bb_largest_free_order = i; | ||
677 | break; | ||
678 | } | ||
679 | } | ||
680 | } | ||
681 | |||
661 | static noinline_for_stack | 682 | static noinline_for_stack |
662 | void ext4_mb_generate_buddy(struct super_block *sb, | 683 | void ext4_mb_generate_buddy(struct super_block *sb, |
663 | void *buddy, void *bitmap, ext4_group_t group) | 684 | void *buddy, void *bitmap, ext4_group_t group) |
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, | |||
700 | */ | 721 | */ |
701 | grp->bb_free = free; | 722 | grp->bb_free = free; |
702 | } | 723 | } |
724 | mb_set_largest_free_order(sb, grp); | ||
703 | 725 | ||
704 | clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); | 726 | clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); |
705 | 727 | ||
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, | |||
725 | * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. | 747 | * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. |
726 | * So it can have information regarding groups_per_page which | 748 | * So it can have information regarding groups_per_page which |
727 | * is blocks_per_page/2 | 749 | * is blocks_per_page/2 |
750 | * | ||
751 | * Locking note: This routine takes the block group lock of all groups | ||
752 | * for this page; do not hold this lock when calling this routine! | ||
728 | */ | 753 | */ |
729 | 754 | ||
730 | static int ext4_mb_init_cache(struct page *page, char *incore) | 755 | static int ext4_mb_init_cache(struct page *page, char *incore) |
@@ -910,6 +935,11 @@ out: | |||
910 | return err; | 935 | return err; |
911 | } | 936 | } |
912 | 937 | ||
938 | /* | ||
939 | * Locking note: This routine calls ext4_mb_init_cache(), which takes the | ||
940 | * block group lock of all groups for this page; do not hold the BG lock when | ||
941 | * calling this routine! | ||
942 | */ | ||
913 | static noinline_for_stack | 943 | static noinline_for_stack |
914 | int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | 944 | int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) |
915 | { | 945 | { |
@@ -1004,6 +1034,11 @@ err: | |||
1004 | return ret; | 1034 | return ret; |
1005 | } | 1035 | } |
1006 | 1036 | ||
1037 | /* | ||
1038 | * Locking note: This routine calls ext4_mb_init_cache(), which takes the | ||
1039 | * block group lock of all groups for this page; do not hold the BG lock when | ||
1040 | * calling this routine! | ||
1041 | */ | ||
1007 | static noinline_for_stack int | 1042 | static noinline_for_stack int |
1008 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | 1043 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, |
1009 | struct ext4_buddy *e4b) | 1044 | struct ext4_buddy *e4b) |
@@ -1299,6 +1334,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1299 | buddy = buddy2; | 1334 | buddy = buddy2; |
1300 | } while (1); | 1335 | } while (1); |
1301 | } | 1336 | } |
1337 | mb_set_largest_free_order(sb, e4b->bd_info); | ||
1302 | mb_check_buddy(e4b); | 1338 | mb_check_buddy(e4b); |
1303 | } | 1339 | } |
1304 | 1340 | ||
@@ -1427,6 +1463,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
1427 | e4b->bd_info->bb_counters[ord]++; | 1463 | e4b->bd_info->bb_counters[ord]++; |
1428 | e4b->bd_info->bb_counters[ord]++; | 1464 | e4b->bd_info->bb_counters[ord]++; |
1429 | } | 1465 | } |
1466 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); | ||
1430 | 1467 | ||
1431 | mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | 1468 | mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); |
1432 | mb_check_buddy(e4b); | 1469 | mb_check_buddy(e4b); |
@@ -1821,16 +1858,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, | |||
1821 | } | 1858 | } |
1822 | } | 1859 | } |
1823 | 1860 | ||
1861 | /* This is now called BEFORE we load the buddy bitmap. */ | ||
1824 | static int ext4_mb_good_group(struct ext4_allocation_context *ac, | 1862 | static int ext4_mb_good_group(struct ext4_allocation_context *ac, |
1825 | ext4_group_t group, int cr) | 1863 | ext4_group_t group, int cr) |
1826 | { | 1864 | { |
1827 | unsigned free, fragments; | 1865 | unsigned free, fragments; |
1828 | unsigned i, bits; | ||
1829 | int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); | 1866 | int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); |
1830 | struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); | 1867 | struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); |
1831 | 1868 | ||
1832 | BUG_ON(cr < 0 || cr >= 4); | 1869 | BUG_ON(cr < 0 || cr >= 4); |
1833 | BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); | 1870 | |
1871 | /* We only do this if the grp has never been initialized */ | ||
1872 | if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { | ||
1873 | int ret = ext4_mb_init_group(ac->ac_sb, group); | ||
1874 | if (ret) | ||
1875 | return 0; | ||
1876 | } | ||
1834 | 1877 | ||
1835 | free = grp->bb_free; | 1878 | free = grp->bb_free; |
1836 | fragments = grp->bb_fragments; | 1879 | fragments = grp->bb_fragments; |
@@ -1843,17 +1886,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, | |||
1843 | case 0: | 1886 | case 0: |
1844 | BUG_ON(ac->ac_2order == 0); | 1887 | BUG_ON(ac->ac_2order == 0); |
1845 | 1888 | ||
1889 | if (grp->bb_largest_free_order < ac->ac_2order) | ||
1890 | return 0; | ||
1891 | |||
1846 | /* Avoid using the first bg of a flexgroup for data files */ | 1892 | /* Avoid using the first bg of a flexgroup for data files */ |
1847 | if ((ac->ac_flags & EXT4_MB_HINT_DATA) && | 1893 | if ((ac->ac_flags & EXT4_MB_HINT_DATA) && |
1848 | (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && | 1894 | (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && |
1849 | ((group % flex_size) == 0)) | 1895 | ((group % flex_size) == 0)) |
1850 | return 0; | 1896 | return 0; |
1851 | 1897 | ||
1852 | bits = ac->ac_sb->s_blocksize_bits + 1; | 1898 | return 1; |
1853 | for (i = ac->ac_2order; i <= bits; i++) | ||
1854 | if (grp->bb_counters[i] > 0) | ||
1855 | return 1; | ||
1856 | break; | ||
1857 | case 1: | 1899 | case 1: |
1858 | if ((free / fragments) >= ac->ac_g_ex.fe_len) | 1900 | if ((free / fragments) >= ac->ac_g_ex.fe_len) |
1859 | return 1; | 1901 | return 1; |
@@ -2024,14 +2066,11 @@ repeat: | |||
2024 | group = ac->ac_g_ex.fe_group; | 2066 | group = ac->ac_g_ex.fe_group; |
2025 | 2067 | ||
2026 | for (i = 0; i < ngroups; group++, i++) { | 2068 | for (i = 0; i < ngroups; group++, i++) { |
2027 | struct ext4_group_info *grp; | ||
2028 | |||
2029 | if (group == ngroups) | 2069 | if (group == ngroups) |
2030 | group = 0; | 2070 | group = 0; |
2031 | 2071 | ||
2032 | /* quick check to skip empty groups */ | 2072 | /* This now checks without needing the buddy page */ |
2033 | grp = ext4_get_group_info(sb, group); | 2073 | if (!ext4_mb_good_group(ac, group, cr)) |
2034 | if (grp->bb_free == 0) | ||
2035 | continue; | 2074 | continue; |
2036 | 2075 | ||
2037 | err = ext4_mb_load_buddy(sb, group, &e4b); | 2076 | err = ext4_mb_load_buddy(sb, group, &e4b); |
@@ -2039,8 +2078,12 @@ repeat: | |||
2039 | goto out; | 2078 | goto out; |
2040 | 2079 | ||
2041 | ext4_lock_group(sb, group); | 2080 | ext4_lock_group(sb, group); |
2081 | |||
2082 | /* | ||
2083 | * We need to check again after locking the | ||
2084 | * block group | ||
2085 | */ | ||
2042 | if (!ext4_mb_good_group(ac, group, cr)) { | 2086 | if (!ext4_mb_good_group(ac, group, cr)) { |
2043 | /* someone did allocation from this group */ | ||
2044 | ext4_unlock_group(sb, group); | 2087 | ext4_unlock_group(sb, group); |
2045 | ext4_mb_unload_buddy(&e4b); | 2088 | ext4_mb_unload_buddy(&e4b); |
2046 | continue; | 2089 | continue; |
@@ -2253,6 +2296,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2253 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | 2296 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); |
2254 | init_rwsem(&meta_group_info[i]->alloc_sem); | 2297 | init_rwsem(&meta_group_info[i]->alloc_sem); |
2255 | meta_group_info[i]->bb_free_root = RB_ROOT; | 2298 | meta_group_info[i]->bb_free_root = RB_ROOT; |
2299 | meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ | ||
2256 | 2300 | ||
2257 | #ifdef DOUBLE_CHECK | 2301 | #ifdef DOUBLE_CHECK |
2258 | { | 2302 | { |