aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/mballoc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r-- fs/ext4/mballoc.c 117
1 files changed, 29 insertions, 88 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b0d6022eaa67..c4c430977622 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
46 * The allocation request involve request for multiple number of blocks 46 * The allocation request involve request for multiple number of blocks
47 * near to the goal(block) value specified. 47 * near to the goal(block) value specified.
48 * 48 *
49 * During initialization phase of the allocator we decide to use the group 49 * During initialization phase of the allocator we decide to use the
50 * preallocation or inode preallocation depending on the size file. The 50 * group preallocation or inode preallocation depending on the size of
51 * size of the file could be the resulting file size we would have after 51 * the file. The size of the file could be the resulting file size we
52 * allocation or the current file size which ever is larger. If the size is 52 * would have after allocation, or the current file size, which ever
53 * less that sbi->s_mb_stream_request we select the group 53 * is larger. If the size is less than sbi->s_mb_stream_request we
54 * preallocation. The default value of s_mb_stream_request is 16 54 * select to use the group preallocation. The default value of
55 * blocks. This can also be tuned via 55 * s_mb_stream_request is 16 blocks. This can also be tuned via
56 * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms 56 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
57 * of number of blocks. 57 * terms of number of blocks.
58 * 58 *
59 * The main motivation for having small file use group preallocation is to 59 * The main motivation for having small file use group preallocation is to
60 * ensure that we have small file closer in the disk. 60 * ensure that we have small files closer together on the disk.
61 * 61 *
62 * First stage the allocator looks at the inode prealloc list 62 * First stage the allocator looks at the inode prealloc list,
63 * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for 63 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
64 * this particular inode. The inode prealloc space is represented as: 64 * spaces for this particular inode. The inode prealloc space is
65 * represented as:
65 * 66 *
66 * pa_lstart -> the logical start block for this prealloc space 67 * pa_lstart -> the logical start block for this prealloc space
67 * pa_pstart -> the physical start block for this prealloc space 68 * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
121 * list. In case of inode preallocation we follow a list of heuristics 122 * list. In case of inode preallocation we follow a list of heuristics
122 * based on file size. This can be found in ext4_mb_normalize_request. If 123 * based on file size. This can be found in ext4_mb_normalize_request. If
123 * we are doing a group prealloc we try to normalize the request to 124 * we are doing a group prealloc we try to normalize the request to
124 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to 125 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
125 * 512 blocks. This can be tuned via 126 * 512 blocks. This can be tuned via
126 * /proc/fs/ext4/<partition/group_prealloc. The value is represented in 127 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
127 * terms of number of blocks. If we have mounted the file system with -O 128 * terms of number of blocks. If we have mounted the file system with -O
128 * stripe=<value> option the group prealloc request is normalized to the 129 * stripe=<value> option the group prealloc request is normalized to the
129 * stripe value (sbi->s_stripe) 130 * stripe value (sbi->s_stripe)
130 * 131 *
131 * The regular allocator(using the buddy cache) support few tunables. 132 * The regular allocator (using the buddy cache) supports a few tunables.
132 * 133 *
133 * /proc/fs/ext4/<partition>/min_to_scan 134 * /sys/fs/ext4/<partition>/mb_min_to_scan
134 * /proc/fs/ext4/<partition>/max_to_scan 135 * /sys/fs/ext4/<partition>/mb_max_to_scan
135 * /proc/fs/ext4/<partition>/order2_req 136 * /sys/fs/ext4/<partition>/mb_order2_req
136 * 137 *
137 * The regular allocator use buddy scan only if the request len is power of 138 * The regular allocator uses buddy scan only if the request len is power of
138 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 139 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
139 * value of s_mb_order2_reqs can be tuned via 140 * value of s_mb_order2_reqs can be tuned via
140 * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to 141 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
141 * stripe size (sbi->s_stripe), we try to search for contiguous block in 142 * stripe size (sbi->s_stripe), we try to search for contiguous block in
142 * stripe size. This should result in better allocation on RAID setup. If 143 * stripe size. This should result in better allocation on RAID setups. If
143 * not we search in the specific group using bitmap for best extents. The 144 * not, we search in the specific group using bitmap for best extents. The
144 * tunable min_to_scan and max_to_scan controll the behaviour here. 145 * tunable min_to_scan and max_to_scan control the behaviour here.
145 * min_to_scan indicates how long the mballoc __must__ look for a best 146 * min_to_scan indicates how long the mballoc __must__ look for a best
146 * extent and max_to_scanindicate how long the mballoc __can__ look for a 147 * extent and max_to_scan indicates how long the mballoc __can__ look for a
147 * best extent in the found extents. Searching for the blocks starts with 148 * best extent in the found extents. Searching for the blocks starts with
148 * the group specified as the goal value in allocation context via 149 * the group specified as the goal value in allocation context via
149 * ac_g_ex. Each group is first checked based on the criteria whether it 150 * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
337 ext4_group_t group); 338 ext4_group_t group);
338static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 339static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
339 ext4_group_t group); 340 ext4_group_t group);
340static int ext4_mb_init_per_dev_proc(struct super_block *sb);
341static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
342static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 341static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
343 342
344 343
@@ -1978,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1978 /* 1977 /*
1979 * We search using buddy data only if the order of the request 1978 * We search using buddy data only if the order of the request
1980 * is greater than or equal to sbi->s_mb_order2_reqs. 1979 * is greater than or equal to sbi->s_mb_order2_reqs.
1981 * You can tune it via /proc/fs/ext4/<partition>/order2_req 1980 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
1982 */ 1981 */
1983 if (i >= sbi->s_mb_order2_reqs) { 1982 if (i >= sbi->s_mb_order2_reqs) {
1984 /* 1983 /*
@@ -2753,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2753 spin_lock_init(&lg->lg_prealloc_lock); 2752 spin_lock_init(&lg->lg_prealloc_lock);
2754 } 2753 }
2755 2754
2756 ext4_mb_init_per_dev_proc(sb);
2757 ext4_mb_history_init(sb); 2755 ext4_mb_history_init(sb);
2758 2756
2759 if (sbi->s_journal) 2757 if (sbi->s_journal)
@@ -2836,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
2836 2834
2837 free_percpu(sbi->s_locality_groups); 2835 free_percpu(sbi->s_locality_groups);
2838 ext4_mb_history_release(sb); 2836 ext4_mb_history_release(sb);
2839 ext4_mb_destroy_per_dev_proc(sb);
2840 2837
2841 return 0; 2838 return 0;
2842} 2839}
@@ -2897,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2897 mb_debug("freed %u blocks in %u structures\n", count, count2); 2894 mb_debug("freed %u blocks in %u structures\n", count, count2);
2898} 2895}
2899 2896
2900#define EXT4_MB_STATS_NAME "stats"
2901#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
2902#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
2903#define EXT4_MB_ORDER2_REQ "order2_req"
2904#define EXT4_MB_STREAM_REQ "stream_req"
2905#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
2906
2907static int ext4_mb_init_per_dev_proc(struct super_block *sb)
2908{
2909#ifdef CONFIG_PROC_FS
2910 mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
2911 struct ext4_sb_info *sbi = EXT4_SB(sb);
2912 struct proc_dir_entry *proc;
2913
2914 if (sbi->s_proc == NULL)
2915 return -EINVAL;
2916
2917 EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
2918 EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
2919 EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
2920 EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
2921 EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
2922 EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
2923 return 0;
2924
2925err_out:
2926 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2927 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2928 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2929 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2930 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2931 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2932 return -ENOMEM;
2933#else
2934 return 0;
2935#endif
2936}
2937
2938static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
2939{
2940#ifdef CONFIG_PROC_FS
2941 struct ext4_sb_info *sbi = EXT4_SB(sb);
2942
2943 if (sbi->s_proc == NULL)
2944 return -EINVAL;
2945
2946 remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
2947 remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
2948 remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
2949 remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
2950 remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
2951 remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
2952#endif
2953 return 0;
2954}
2955
2956int __init init_ext4_mballoc(void) 2897int __init init_ext4_mballoc(void)
2957{ 2898{
2958 ext4_pspace_cachep = 2899 ext4_pspace_cachep =
@@ -3123,7 +3064,7 @@ out_err:
3123 * here we normalize request for locality group 3064 * here we normalize request for locality group
3124 * Group requests are normalized to s_stripe size if we set the same via mount 3065 * Group requests are normalized to s_stripe size if we set the same via mount
3125 * option. If not we set it to s_mb_group_prealloc which can be configured via 3066 * option. If not we set it to s_mb_group_prealloc which can be configured via
3126 * /proc/fs/ext4/<partition>/group_prealloc 3067 * /sys/fs/ext4/<partition>/mb_group_prealloc
3127 * 3068 *
3128 * XXX: should we try to preallocate more than the group has now? 3069 * XXX: should we try to preallocate more than the group has now?
3129 */ 3070 */
@@ -4239,7 +4180,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4239 * file is determined by the current size or the resulting size after 4180 * file is determined by the current size or the resulting size after
4240 * allocation which ever is larger 4181 * allocation which ever is larger
4241 * 4182 *
4242 * One can tune this size via /proc/fs/ext4/<partition>/stream_req 4183 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4243 */ 4184 */
4244static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 4185static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4245{ 4186{