author | Alex Tomas <alex@clusterfs.com> | 2008-01-29 00:19:52 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2008-01-29 00:19:52 -0500 |
commit | c9de560ded61faa5b754137b7753da252391c55a (patch) | |
tree | 2c4311377c4aa72450e27f531e198fe3e1c67db0 /fs/ext4/mballoc.c | |
parent | 1988b51e476bd097d910c9245b53f2e38aedaf0d (diff) |
ext4: Add multi block allocator for ext4
Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Andreas Dilger <adilger@clusterfs.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r-- | fs/ext4/mballoc.c | 4552 |
1 files changed, 4552 insertions, 0 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
new file mode 100644
index 000000000000..76e5fedc0a0b
--- /dev/null
+++ b/fs/ext4/mballoc.c
@@ -0,0 +1,4552 @@
1 | /* | ||
2 | * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com | ||
3 | * Written by Alex Tomas <alex@clusterfs.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify | ||
6 | * it under the terms of the GNU General Public License version 2 as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
17 | */ | ||
18 | |||
19 | |||
20 | /* | ||
21 | * mballoc.c contains the multiblock allocation routines | ||
22 | */ | ||
23 | |||
24 | #include <linux/time.h> | ||
25 | #include <linux/fs.h> | ||
26 | #include <linux/namei.h> | ||
27 | #include <linux/ext4_jbd2.h> | ||
28 | #include <linux/ext4_fs.h> | ||
29 | #include <linux/quotaops.h> | ||
30 | #include <linux/buffer_head.h> | ||
31 | #include <linux/module.h> | ||
32 | #include <linux/swap.h> | ||
33 | #include <linux/proc_fs.h> | ||
34 | #include <linux/pagemap.h> | ||
35 | #include <linux/seq_file.h> | ||
36 | #include <linux/version.h> | ||
37 | #include "group.h" | ||
38 | |||
39 | /* | ||
40 | * MUSTDO: | ||
41 | * - test ext4_ext_search_left() and ext4_ext_search_right() | ||
42 | * - search for metadata in few groups | ||
43 | * | ||
44 | * TODO v4: | ||
45 | * - normalization should take into account whether file is still open | ||
46 | * - discard preallocations if no free space left (policy?) | ||
47 | * - don't normalize tails | ||
48 | * - quota | ||
49 | * - reservation for superuser | ||
50 | * | ||
51 | * TODO v3: | ||
52 | * - bitmap read-ahead (proposed by Oleg Drokin aka green) | ||
53 | * - track min/max extents in each group for better group selection | ||
54 | * - mb_mark_used() may allocate chunk right after splitting buddy | ||
55 | * - tree of groups sorted by number of free blocks | ||
56 | * - error handling | ||
57 | */ | ||
58 | |||
59 | /* | ||
60 | * An allocation request involves a request for multiple blocks | ||
61 | * near to the goal (block) value specified. | ||
62 | * | ||
63 | * During the initialization phase of the allocator we decide to use group | ||
64 | * preallocation or inode preallocation depending on the file size. The | ||
65 | * size of the file could be the resulting file size we would have after | ||
66 | * allocation or the current file size, whichever is larger. If the size is | ||
67 | * less than sbi->s_mb_stream_request we select group | ||
68 | * preallocation. The default value of s_mb_stream_request is 16 | ||
69 | * blocks. This can also be tuned via | ||
70 | * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms | ||
71 | * of number of blocks. | ||
72 | * | ||
73 | * The main motivation for having small files use group preallocation is to | ||
74 | * ensure that small files are placed closer together on the disk. | ||
75 | * | ||
76 | * In the first stage the allocator looks at the inode prealloc list, | ||
77 | * ext4_inode_info->i_prealloc_list, which contains a list of prealloc spaces | ||
78 | * for this particular inode. The inode prealloc space is represented as: | ||
79 | * | ||
80 | * pa_lstart -> the logical start block for this prealloc space | ||
81 | * pa_pstart -> the physical start block for this prealloc space | ||
82 | * pa_len -> length of this prealloc space | ||
83 | * pa_free -> free space available in this prealloc space | ||
84 | * | ||
85 | * The inode preallocation space is used by looking at the _logical_ start | ||
86 | * block. Only if the logical file block falls within the range of the | ||
87 | * prealloc space do we consume that particular prealloc space. This makes | ||
88 | * sure that we have contiguous physical blocks representing the file blocks | ||
89 | * | ||
90 | * The important thing to be noted in case of inode prealloc space is that | ||
91 | * we don't modify the values associated with the inode prealloc space except | ||
92 | * pa_free. | ||
93 | * | ||
94 | * If we are not able to find blocks in the inode prealloc space and if we | ||
95 | * have the group allocation flag set then we look at the locality group | ||
96 | * prealloc space. These are per-CPU prealloc lists represented as | ||
97 | * | ||
98 | * ext4_sb_info.s_locality_groups[smp_processor_id()] | ||
99 | * | ||
100 | * The reason for having a per cpu locality group is to reduce the contention | ||
101 | * between CPUs. It is possible to get scheduled at this point. | ||
102 | * | ||
103 | * The locality group prealloc space is used by looking at whether we have | ||
104 | * enough free space (pa_free) within the prealloc space. | ||
105 | * | ||
106 | * If we can't allocate blocks via inode prealloc and/or locality group | ||
107 | * prealloc then we look at the buddy cache. The buddy cache is represented | ||
108 | * by ext4_sb_info.s_buddy_cache (struct inode) whose file offsets get | ||
109 | * mapped to the buddy and bitmap information regarding the different | ||
110 | * groups. The buddy information is attached to the buddy cache inode so that | ||
111 | * we can access it through the page cache. The information regarding | ||
112 | * each group is loaded via ext4_mb_load_buddy. It involves the | ||
113 | * block bitmap and the buddy information, which are stored in the | ||
114 | * inode as: | ||
115 | * | ||
116 | * { page } | ||
117 | * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | ||
118 | * | ||
119 | * | ||
120 | * one block each for bitmap and buddy information. So for each group we | ||
121 | * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / | ||
122 | * blocksize) blocks. So it can hold information regarding groups_per_page | ||
123 | * groups, which is blocks_per_page/2 | ||
124 | * | ||
125 | * The buddy cache inode is not stored on disk. The inode is thrown | ||
126 | * away when the filesystem is unmounted. | ||
127 | * | ||
128 | * We look for the requested number of blocks in the buddy cache. If we are | ||
129 | * able to locate that many free blocks we return with additional information | ||
130 | * regarding the rest of the contiguous physical blocks available | ||
131 | * | ||
132 | * Before allocating blocks via the buddy cache we normalize the request | ||
133 | * blocks. This ensures we ask for more blocks than we need. The extra | ||
134 | * blocks that we get after allocation are added to the respective prealloc | ||
135 | * list. In case of inode preallocation we follow a list of heuristics | ||
136 | * based on file size. This can be found in ext4_mb_normalize_request. If | ||
137 | * we are doing a group prealloc we try to normalize the request to | ||
138 | * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is | ||
139 | * 512 blocks. This can be tuned via | ||
140 | * /proc/fs/ext4/<partition>/group_prealloc. The value is represented in | ||
141 | * terms of number of blocks. If we have mounted the file system with the | ||
142 | * -o stripe=<value> option the group prealloc request is normalized to the | ||
143 | * stripe value (sbi->s_stripe) | ||
144 | * | ||
145 | * The regular allocator (using the buddy cache) supports a few tunables. | ||
146 | * | ||
147 | * /proc/fs/ext4/<partition>/min_to_scan | ||
148 | * /proc/fs/ext4/<partition>/max_to_scan | ||
149 | * /proc/fs/ext4/<partition>/order2_req | ||
150 | * | ||
151 | * The regular allocator uses the buddy scan only if the request len is a | ||
152 | * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. | ||
153 | * The value of s_mb_order2_reqs can be tuned via | ||
154 | * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to the | ||
155 | * stripe size (sbi->s_stripe), we try to search for contiguous blocks in | ||
156 | * stripe-size units. This should result in better allocation on RAID setups. | ||
157 | * If not, we search in the specific group using the bitmap for best extents. | ||
158 | * The tunables min_to_scan and max_to_scan control the behaviour here. | ||
159 | * min_to_scan indicates how long mballoc __must__ look for a best | ||
160 | * extent and max_to_scan indicates how long mballoc __can__ look for a | ||
161 | * best extent among the found extents. Searching for the blocks starts with | ||
162 | * the group specified as the goal value in the allocation context via | ||
163 | * ac_g_ex. Each group is first checked based on the criteria of whether it | ||
164 | * can be used for allocation. ext4_mb_good_group explains how the groups are | ||
165 | * checked. | ||
166 | * | ||
167 | * Both prealloc spaces are populated as above. So for the first | ||
168 | * request we will hit the buddy cache, which will result in the prealloc | ||
169 | * space getting filled. The prealloc space is then later used for | ||
170 | * subsequent requests. | ||
171 | */ | ||
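To make the buddy-cache layout just described concrete, here is a small userspace sketch (an editor's illustration assuming 4K pages and 1K blocks, not part of the patch) of the arithmetic `ext4_mb_load_buddy` performs later in this file: group g keeps its bitmap in block 2*g and its buddy in block 2*g + 1 of the buddy cache inode.

```c
#include <stdio.h>

/* Illustrative values; in the kernel these come from the superblock
 * (sb->s_blocksize) and PAGE_CACHE_SIZE. */
#define PAGE_SIZE_ 4096
#define BLOCKSIZE  1024

int main(void)
{
	int blocks_per_page = PAGE_SIZE_ / BLOCKSIZE;   /* 4 */
	int groups_per_page = blocks_per_page / 2;      /* 2 */
	unsigned group = 5;

	/* each group occupies two consecutive blocks in the buddy
	 * cache inode: block 2*g is the bitmap, block 2*g + 1 the buddy */
	int bitmap_block = group * 2;
	int buddy_block  = group * 2 + 1;

	printf("group %u: bitmap in page %d at offset %d\n", group,
	       bitmap_block / blocks_per_page,
	       (bitmap_block % blocks_per_page) * BLOCKSIZE);
	printf("group %u: buddy  in page %d at offset %d\n", group,
	       buddy_block / blocks_per_page,
	       (buddy_block % blocks_per_page) * BLOCKSIZE);
	printf("groups per page: %d\n", groups_per_page);
	return 0;
}
```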
172 | |||
173 | /* | ||
174 | * mballoc operates on the following data: | ||
175 | * - on-disk bitmap | ||
176 | * - in-core buddy (actually includes buddy and bitmap) | ||
177 | * - preallocation descriptors (PAs) | ||
178 | * | ||
179 | * there are two types of preallocations: | ||
180 | * - inode | ||
181 | * assigned to a specific inode and can be used for this inode only. | ||
182 | * it describes part of the inode's space preallocated to specific | ||
183 | * physical blocks. any block from that preallocation can be used | ||
184 | * independently. the descriptor just tracks the number of blocks left | ||
185 | * unused. so, before taking some block from the descriptor, one must | ||
186 | * make sure the corresponding logical block isn't allocated yet. this | ||
187 | * also means that freeing any block within the descriptor's range | ||
188 | * must discard all preallocated blocks. | ||
189 | * - locality group | ||
190 | * assigned to a specific locality group which does not translate to a | ||
191 | * permanent set of inodes: an inode can join and leave the group. space | ||
192 | * from this type of preallocation can be used for any inode. thus | ||
193 | * it's consumed from the beginning to the end. | ||
194 | * | ||
195 | * relation between them can be expressed as: | ||
196 | * in-core buddy = on-disk bitmap + preallocation descriptors | ||
197 | * | ||
198 | * this means the blocks mballoc considers used are: | ||
199 | * - allocated blocks (persistent) | ||
200 | * - preallocated blocks (non-persistent) | ||
201 | * | ||
202 | * consistency in mballoc world means that at any time a block is either | ||
203 | * free or used in ALL structures. notice: "any time" should not be read | ||
204 | * literally -- time is discrete and delimited by locks. | ||
205 | * | ||
206 | * to keep it simple, we don't use block numbers, instead we count numbers of | ||
207 | * blocks: how many blocks are marked used/free in the on-disk bitmap, buddy and PA. | ||
208 | * | ||
209 | * all operations can be expressed as: | ||
210 | * - init buddy: buddy = on-disk + PAs | ||
211 | * - new PA: buddy += N; PA = N | ||
212 | * - use inode PA: on-disk += N; PA -= N | ||
213 | * - discard inode PA: buddy -= on-disk - PA; PA = 0 | ||
214 | * - use locality group PA: on-disk += N; PA -= N | ||
215 | * - discard locality group PA: buddy -= PA; PA = 0 | ||
216 | * note: 'buddy -= on-disk - PA' is used to show that the on-disk bitmap | ||
217 | * is used in the real operation, because we can't know the actually used | ||
218 | * bits from the PA, only from the on-disk bitmap | ||
219 | * | ||
220 | * if we follow this strict logic, then all operations above should be atomic. | ||
221 | * given some of them can block, we'd have to use something like semaphores | ||
222 | * killing performance on high-end SMP hardware. let's try to relax it using | ||
223 | * the following knowledge: | ||
224 | * 1) if buddy is referenced, it's already initialized | ||
225 | * 2) while block is used in buddy and the buddy is referenced, | ||
226 | * nobody can re-allocate that block | ||
227 | * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has a | ||
228 | * bit set and a PA claims the same block, it's OK. IOW, one can set a bit in | ||
229 | * the on-disk bitmap if the buddy has the same bit set and/or a PA covers the | ||
230 | * corresponding block | ||
231 | * | ||
232 | * so, now we're building a concurrency table: | ||
233 | * - init buddy vs. | ||
234 | * - new PA | ||
235 | * blocks for PA are allocated in the buddy, buddy must be referenced | ||
236 | * until PA is linked to allocation group to avoid concurrent buddy init | ||
237 | * - use inode PA | ||
238 | * we need to make sure that either the on-disk bitmap or the PA has | ||
239 | * uptodate data; given (3) we care that PA -= N doesn't interfere with init | ||
240 | * - discard inode PA | ||
241 | * the simplest way would be to have buddy initialized by the discard | ||
242 | * - use locality group PA | ||
243 | * again PA-=N must be serialized with init | ||
244 | * - discard locality group PA | ||
245 | * the simplest way would be to have buddy initialized by the discard | ||
246 | * - new PA vs. | ||
247 | * - use inode PA | ||
248 | * i_data_sem serializes them | ||
249 | * - discard inode PA | ||
250 | * discard process must wait until PA isn't used by another process | ||
251 | * - use locality group PA | ||
252 | * some mutex should serialize them | ||
253 | * - discard locality group PA | ||
254 | * discard process must wait until PA isn't used by another process | ||
255 | * - use inode PA | ||
256 | * - use inode PA | ||
257 | * i_data_sem or another mutex should serialize them | ||
258 | * - discard inode PA | ||
259 | * discard process must wait until PA isn't used by another process | ||
260 | * - use locality group PA | ||
261 | * nothing wrong here -- they're different PAs covering different blocks | ||
262 | * - discard locality group PA | ||
263 | * discard process must wait until PA isn't used by another process | ||
264 | * | ||
265 | * now we're ready to draw a few conclusions: | ||
266 | * - while a PA is referenced, no discard of it is possible | ||
267 | * - a PA stays referenced until its blocks are marked in the on-disk bitmap | ||
268 | * - a PA changes only after the on-disk bitmap | ||
269 | * - discard must not compete with init. either init is done before | ||
270 | * any discard, or they're serialized somehow | ||
271 | * - buddy init as the sum of on-disk bitmap and PAs is done atomically | ||
272 | * | ||
273 | * a special case is when we've used a PA to emptiness. no need to modify the | ||
274 | * buddy in this case, but we should take care about concurrent init | ||
275 | * | ||
276 | */ | ||
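The accounting rules above can be sanity-checked with a toy single-counter model (an editor's sketch only: real mballoc tracks individual bits, not one counter, and the discard here follows the locality-group formula `buddy -= PA`). The invariant `buddy = on-disk + PA` holds after every operation.

```c
#include <assert.h>
#include <stdio.h>

/* toy model: count blocks marked used in each structure */
static int on_disk, pa, buddy;

static void check(void)      { assert(buddy == on_disk + pa); }

static void init_buddy(void) { buddy = on_disk + pa; check(); }
static void new_pa(int n)    { buddy += n; pa += n; check(); }
static void use_pa(int n)    { on_disk += n; pa -= n; check(); }
/* discard (locality group form): unused blocks become free again */
static void discard_pa(void) { buddy -= pa; pa = 0; check(); }

int main(void)
{
	on_disk = 100;          /* blocks already allocated on disk */
	init_buddy();           /* init buddy: buddy = on-disk + PAs */
	new_pa(8);              /* preallocate 8 blocks              */
	use_pa(3);              /* the file actually writes 3        */
	discard_pa();           /* give back the unused 5            */
	printf("on_disk=%d pa=%d buddy=%d\n", on_disk, pa, buddy);
	return 0;
}
```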
277 | |||
278 | /* | ||
279 | * Logic in few words: | ||
280 | * | ||
281 | * - allocation: | ||
282 | * load group | ||
283 | * find blocks | ||
284 | * mark bits in on-disk bitmap | ||
285 | * release group | ||
286 | * | ||
287 | * - use preallocation: | ||
288 | * find proper PA (per-inode or group) | ||
289 | * load group | ||
290 | * mark bits in on-disk bitmap | ||
291 | * release group | ||
292 | * release PA | ||
293 | * | ||
294 | * - free: | ||
295 | * load group | ||
296 | * mark bits in on-disk bitmap | ||
297 | * release group | ||
298 | * | ||
299 | * - discard preallocations in group: | ||
300 | * mark PAs deleted | ||
301 | * move them onto local list | ||
302 | * load on-disk bitmap | ||
303 | * load group | ||
304 | * remove PA from object (inode or locality group) | ||
305 | * mark free blocks in-core | ||
306 | * | ||
307 | * - discard inode's preallocations: | ||
308 | */ | ||
309 | |||
310 | /* | ||
311 | * Locking rules | ||
312 | * | ||
313 | * Locks: | ||
314 | * - bitlock on a group (group) | ||
315 | * - object (inode/locality) (object) | ||
316 | * - per-pa lock (pa) | ||
317 | * | ||
318 | * Paths: | ||
319 | * - new pa | ||
320 | * object | ||
321 | * group | ||
322 | * | ||
323 | * - find and use pa: | ||
324 | * pa | ||
325 | * | ||
326 | * - release consumed pa: | ||
327 | * pa | ||
328 | * group | ||
329 | * object | ||
330 | * | ||
331 | * - generate in-core bitmap: | ||
332 | * group | ||
333 | * pa | ||
334 | * | ||
335 | * - discard all for given object (inode, locality group): | ||
336 | * object | ||
337 | * pa | ||
338 | * group | ||
339 | * | ||
340 | * - discard all for given group: | ||
341 | * group | ||
342 | * pa | ||
343 | * group | ||
344 | * object | ||
345 | * | ||
346 | */ | ||
347 | |||
348 | /* | ||
349 | * with AGGRESSIVE_CHECK the allocator runs consistency checks over its | ||
350 | * structures. these checks slow things down a lot | ||
351 | */ | ||
352 | #define AGGRESSIVE_CHECK__ | ||
353 | |||
354 | /* | ||
355 | * with DOUBLE_CHECK defined mballoc creates persistent in-core | ||
356 | * bitmaps, maintains and uses them to check for double allocations | ||
357 | */ | ||
358 | #define DOUBLE_CHECK__ | ||
359 | |||
360 | /* with MB_DEBUG defined, mballoc prints debugging messages | ||
361 | * via the mb_debug() macro below */ | ||
362 | #define MB_DEBUG__ | ||
363 | #ifdef MB_DEBUG | ||
364 | #define mb_debug(fmt, a...) printk(fmt, ##a) | ||
365 | #else | ||
366 | #define mb_debug(fmt, a...) | ||
367 | #endif | ||
368 | |||
369 | /* | ||
370 | * with EXT4_MB_HISTORY mballoc stores last N allocations in memory | ||
371 | * and you can monitor it in /proc/fs/ext4/<dev>/mb_history | ||
372 | */ | ||
373 | #define EXT4_MB_HISTORY | ||
374 | #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ | ||
375 | #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ | ||
376 | #define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ | ||
377 | #define EXT4_MB_HISTORY_FREE 8 /* free */ | ||
378 | |||
379 | #define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ | ||
380 | EXT4_MB_HISTORY_PREALLOC) | ||
381 | |||
382 | /* | ||
383 | * How long mballoc can look for a best extent (in found extents) | ||
384 | */ | ||
385 | #define MB_DEFAULT_MAX_TO_SCAN 200 | ||
386 | |||
387 | /* | ||
388 | * How long mballoc must look for a best extent | ||
389 | */ | ||
390 | #define MB_DEFAULT_MIN_TO_SCAN 10 | ||
391 | |||
392 | /* | ||
393 | * How many groups mballoc will scan looking for the best chunk | ||
394 | */ | ||
395 | #define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 | ||
396 | |||
397 | /* | ||
398 | * with 'ext4_mb_stats' the allocator will collect stats that will be | ||
399 | * shown at umount. The collecting costs, though! | ||
400 | */ | ||
401 | #define MB_DEFAULT_STATS 1 | ||
402 | |||
403 | /* | ||
404 | * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served | ||
405 | * by the stream allocator, whose purpose is to pack requests | ||
406 | * as close to each other as possible to produce smooth I/O traffic. | ||
407 | * We use the locality group prealloc space for stream requests. | ||
408 | * This can be tuned via /proc/fs/ext4/<partition>/stream_req | ||
409 | */ | ||
410 | #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ | ||
411 | |||
412 | /* | ||
413 | * the minimum order for which requests use the 2^N search on the buddies | ||
414 | */ | ||
415 | #define MB_DEFAULT_ORDER2_REQS 2 | ||
416 | |||
417 | /* | ||
418 | * default group prealloc size 512 blocks | ||
419 | */ | ||
420 | #define MB_DEFAULT_GROUP_PREALLOC 512 | ||
421 | |||
422 | static struct kmem_cache *ext4_pspace_cachep; | ||
423 | |||
424 | #ifdef EXT4_BB_MAX_BLOCKS | ||
425 | #undef EXT4_BB_MAX_BLOCKS | ||
426 | #endif | ||
427 | #define EXT4_BB_MAX_BLOCKS 30 | ||
428 | |||
429 | struct ext4_free_metadata { | ||
430 | ext4_group_t group; | ||
431 | unsigned short num; | ||
432 | ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; | ||
433 | struct list_head list; | ||
434 | }; | ||
435 | |||
436 | struct ext4_group_info { | ||
437 | unsigned long bb_state; | ||
438 | unsigned long bb_tid; | ||
439 | struct ext4_free_metadata *bb_md_cur; | ||
440 | unsigned short bb_first_free; | ||
441 | unsigned short bb_free; | ||
442 | unsigned short bb_fragments; | ||
443 | struct list_head bb_prealloc_list; | ||
444 | #ifdef DOUBLE_CHECK | ||
445 | void *bb_bitmap; | ||
446 | #endif | ||
447 | unsigned short bb_counters[]; | ||
448 | }; | ||
449 | |||
450 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | ||
451 | #define EXT4_GROUP_INFO_LOCKED_BIT 1 | ||
452 | |||
453 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | ||
454 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | ||
455 | |||
456 | |||
457 | struct ext4_prealloc_space { | ||
458 | struct list_head pa_inode_list; | ||
459 | struct list_head pa_group_list; | ||
460 | union { | ||
461 | struct list_head pa_tmp_list; | ||
462 | struct rcu_head pa_rcu; | ||
463 | } u; | ||
464 | spinlock_t pa_lock; | ||
465 | atomic_t pa_count; | ||
466 | unsigned pa_deleted; | ||
467 | ext4_fsblk_t pa_pstart; /* phys. block */ | ||
468 | ext4_lblk_t pa_lstart; /* log. block */ | ||
469 | unsigned short pa_len; /* len of preallocated chunk */ | ||
470 | unsigned short pa_free; /* how many blocks are free */ | ||
471 | unsigned short pa_linear; /* consumed in one direction | ||
472 | * strictly, for grp prealloc */ | ||
473 | spinlock_t *pa_obj_lock; | ||
474 | struct inode *pa_inode; /* hack, for history only */ | ||
475 | }; | ||
476 | |||
477 | |||
478 | struct ext4_free_extent { | ||
479 | ext4_lblk_t fe_logical; | ||
480 | ext4_grpblk_t fe_start; | ||
481 | ext4_group_t fe_group; | ||
482 | int fe_len; | ||
483 | }; | ||
484 | |||
485 | /* | ||
486 | * Locality group: | ||
487 | * we try to group all related changes together | ||
488 | * so that writeback can flush/allocate them together as well | ||
489 | */ | ||
490 | struct ext4_locality_group { | ||
491 | /* for allocator */ | ||
492 | struct mutex lg_mutex; /* to serialize allocates */ | ||
493 | struct list_head lg_prealloc_list;/* list of preallocations */ | ||
494 | spinlock_t lg_prealloc_lock; | ||
495 | }; | ||
496 | |||
497 | struct ext4_allocation_context { | ||
498 | struct inode *ac_inode; | ||
499 | struct super_block *ac_sb; | ||
500 | |||
501 | /* original request */ | ||
502 | struct ext4_free_extent ac_o_ex; | ||
503 | |||
504 | /* goal request (after normalization) */ | ||
505 | struct ext4_free_extent ac_g_ex; | ||
506 | |||
507 | /* the best found extent */ | ||
508 | struct ext4_free_extent ac_b_ex; | ||
509 | |||
510 | /* copy of the best found extent taken before preallocation efforts */ | ||
511 | struct ext4_free_extent ac_f_ex; | ||
512 | |||
513 | /* number of iterations done. we have to track it to limit searching */ | ||
514 | unsigned long ac_ex_scanned; | ||
515 | __u16 ac_groups_scanned; | ||
516 | __u16 ac_found; | ||
517 | __u16 ac_tail; | ||
518 | __u16 ac_buddy; | ||
519 | __u16 ac_flags; /* allocation hints */ | ||
520 | __u8 ac_status; | ||
521 | __u8 ac_criteria; | ||
522 | __u8 ac_repeats; | ||
523 | __u8 ac_2order; /* if request is to allocate 2^N blocks and | ||
524 | * N > 0, the field stores N, otherwise 0 */ | ||
525 | __u8 ac_op; /* operation, for history only */ | ||
526 | struct page *ac_bitmap_page; | ||
527 | struct page *ac_buddy_page; | ||
528 | struct ext4_prealloc_space *ac_pa; | ||
529 | struct ext4_locality_group *ac_lg; | ||
530 | }; | ||
531 | |||
532 | #define AC_STATUS_CONTINUE 1 | ||
533 | #define AC_STATUS_FOUND 2 | ||
534 | #define AC_STATUS_BREAK 3 | ||
535 | |||
536 | struct ext4_mb_history { | ||
537 | struct ext4_free_extent orig; /* orig allocation */ | ||
538 | struct ext4_free_extent goal; /* goal allocation */ | ||
539 | struct ext4_free_extent result; /* result allocation */ | ||
540 | unsigned pid; | ||
541 | unsigned ino; | ||
542 | __u16 found; /* how many extents have been found */ | ||
543 | __u16 groups; /* how many groups have been scanned */ | ||
544 | __u16 tail; /* what tail broke some buddy */ | ||
545 | __u16 buddy; /* buddy the tail ^^^ broke */ | ||
546 | __u16 flags; | ||
547 | __u8 cr:3; /* which phase the result extent was found at */ | ||
548 | __u8 op:4; | ||
549 | __u8 merged:1; | ||
550 | }; | ||
551 | |||
552 | struct ext4_buddy { | ||
553 | struct page *bd_buddy_page; | ||
554 | void *bd_buddy; | ||
555 | struct page *bd_bitmap_page; | ||
556 | void *bd_bitmap; | ||
557 | struct ext4_group_info *bd_info; | ||
558 | struct super_block *bd_sb; | ||
559 | __u16 bd_blkbits; | ||
560 | ext4_group_t bd_group; | ||
561 | }; | ||
562 | #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) | ||
563 | #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) | ||
564 | |||
565 | #ifndef EXT4_MB_HISTORY | ||
566 | static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) | ||
567 | { | ||
568 | return; | ||
569 | } | ||
570 | #else | ||
571 | static void ext4_mb_store_history(struct ext4_allocation_context *ac); | ||
572 | #endif | ||
573 | |||
574 | #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) | ||
575 | |||
576 | static struct proc_dir_entry *proc_root_ext4; | ||
577 | struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); | ||
578 | ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | ||
579 | ext4_fsblk_t goal, unsigned long *count, int *errp); | ||
580 | |||
581 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
582 | ext4_group_t group); | ||
583 | static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); | ||
584 | static void ext4_mb_free_committed_blocks(struct super_block *); | ||
585 | static void ext4_mb_return_to_preallocation(struct inode *inode, | ||
586 | struct ext4_buddy *e4b, sector_t block, | ||
587 | int count); | ||
588 | static void ext4_mb_put_pa(struct ext4_allocation_context *, | ||
589 | struct super_block *, struct ext4_prealloc_space *pa); | ||
590 | static int ext4_mb_init_per_dev_proc(struct super_block *sb); | ||
591 | static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); | ||
592 | |||
593 | |||
594 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | ||
595 | { | ||
596 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
597 | |||
598 | bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
599 | } | ||
600 | |||
601 | static inline void ext4_unlock_group(struct super_block *sb, | ||
602 | ext4_group_t group) | ||
603 | { | ||
604 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
605 | |||
606 | bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); | ||
607 | } | ||
608 | |||
609 | static inline int ext4_is_group_locked(struct super_block *sb, | ||
610 | ext4_group_t group) | ||
611 | { | ||
612 | struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); | ||
613 | |||
614 | return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, | ||
615 | &(grinfo->bb_state)); | ||
616 | } | ||
617 | |||
618 | static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, | ||
619 | struct ext4_free_extent *fex) | ||
620 | { | ||
621 | ext4_fsblk_t block; | ||
622 | |||
623 | block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) | ||
624 | + fex->fe_start | ||
625 | + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | ||
626 | return block; | ||
627 | } | ||
628 | |||
629 | #if BITS_PER_LONG == 64 | ||
630 | #define mb_correct_addr_and_bit(bit, addr) \ | ||
631 | { \ | ||
632 | bit += ((unsigned long) addr & 7UL) << 3; \ | ||
633 | addr = (void *) ((unsigned long) addr & ~7UL); \ | ||
634 | } | ||
635 | #elif BITS_PER_LONG == 32 | ||
636 | #define mb_correct_addr_and_bit(bit, addr) \ | ||
637 | { \ | ||
638 | bit += ((unsigned long) addr & 3UL) << 3; \ | ||
639 | addr = (void *) ((unsigned long) addr & ~3UL); \ | ||
640 | } | ||
641 | #else | ||
642 | #error "how many bits are you?!" | ||
643 | #endif | ||
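A quick userspace rendering (editor's illustration) of the 64-bit variant of `mb_correct_addr_and_bit`: the address is rounded down to an unsigned-long boundary and the bit index grows by the number of bits skipped, so the (addr, bit) pair still names the same bit. This is what lets the mb_* helpers below accept arbitrarily aligned pointers on architectures like powerpc.

```c
#include <stdio.h>

int main(void)
{
	static unsigned long words[2];              /* long-aligned buffer */
	unsigned char *bytes = (unsigned char *) words;
	void *addr = &bytes[5];                     /* deliberately misaligned */
	int bit = 2;                                /* bit 2 of byte 5 = bit 42 */

	/* same transformation as the 64-bit mb_correct_addr_and_bit */
	bit += ((unsigned long) addr & 7UL) << 3;   /* 5 bytes -> 40 extra bits */
	addr = (void *) ((unsigned long) addr & ~7UL);

	printf("aligned addr=%p bit=%d\n", addr, bit);  /* prints bit=42 */
	return 0;
}
```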
644 | |||
645 | static inline int mb_test_bit(int bit, void *addr) | ||
646 | { | ||
647 | /* | ||
648 | * ext4_test_bit on architecture like powerpc | ||
649 | * needs unsigned long aligned address | ||
650 | */ | ||
651 | mb_correct_addr_and_bit(bit, addr); | ||
652 | return ext4_test_bit(bit, addr); | ||
653 | } | ||
654 | |||
655 | static inline void mb_set_bit(int bit, void *addr) | ||
656 | { | ||
657 | mb_correct_addr_and_bit(bit, addr); | ||
658 | ext4_set_bit(bit, addr); | ||
659 | } | ||
660 | |||
661 | static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) | ||
662 | { | ||
663 | mb_correct_addr_and_bit(bit, addr); | ||
664 | ext4_set_bit_atomic(lock, bit, addr); | ||
665 | } | ||
666 | |||
667 | static inline void mb_clear_bit(int bit, void *addr) | ||
668 | { | ||
669 | mb_correct_addr_and_bit(bit, addr); | ||
670 | ext4_clear_bit(bit, addr); | ||
671 | } | ||
672 | |||
673 | static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) | ||
674 | { | ||
675 | mb_correct_addr_and_bit(bit, addr); | ||
676 | ext4_clear_bit_atomic(lock, bit, addr); | ||
677 | } | ||
678 | |||
679 | static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | ||
680 | { | ||
681 | char *bb; | ||
682 | |||
683 | /* FIXME!! is this needed */ | ||
684 | BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); | ||
685 | BUG_ON(max == NULL); | ||
686 | |||
687 | if (order > e4b->bd_blkbits + 1) { | ||
688 | *max = 0; | ||
689 | return NULL; | ||
690 | } | ||
691 | |||
692 | /* at order 0 we see each particular block */ | ||
693 | *max = 1 << (e4b->bd_blkbits + 3); | ||
694 | if (order == 0) | ||
695 | return EXT4_MB_BITMAP(e4b); | ||
696 | |||
697 | bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; | ||
698 | *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; | ||
699 | |||
700 | return bb; | ||
701 | } | ||
702 | |||
703 | #ifdef DOUBLE_CHECK | ||
704 | static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, | ||
705 | int first, int count) | ||
706 | { | ||
707 | int i; | ||
708 | struct super_block *sb = e4b->bd_sb; | ||
709 | |||
710 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) | ||
711 | return; | ||
712 | BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); | ||
713 | for (i = 0; i < count; i++) { | ||
714 | if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { | ||
715 | ext4_fsblk_t blocknr; | ||
716 | blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); | ||
717 | blocknr += first + i; | ||
718 | blocknr += | ||
719 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | ||
720 | |||
721 | ext4_error(sb, __FUNCTION__, "double-free of inode" | ||
722 | " %lu's block %llu(bit %u in group %lu)\n", | ||
723 | inode ? inode->i_ino : 0, blocknr, | ||
724 | first + i, e4b->bd_group); | ||
725 | } | ||
726 | mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); | ||
727 | } | ||
728 | } | ||
729 | |||
730 | static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) | ||
731 | { | ||
732 | int i; | ||
733 | |||
734 | if (unlikely(e4b->bd_info->bb_bitmap == NULL)) | ||
735 | return; | ||
736 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | ||
737 | for (i = 0; i < count; i++) { | ||
738 | BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); | ||
739 | mb_set_bit(first + i, e4b->bd_info->bb_bitmap); | ||
740 | } | ||
741 | } | ||
742 | |||
743 | static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | ||
744 | { | ||
745 | if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { | ||
746 | unsigned char *b1, *b2; | ||
747 | int i; | ||
748 | b1 = (unsigned char *) e4b->bd_info->bb_bitmap; | ||
749 | b2 = (unsigned char *) bitmap; | ||
750 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { | ||
751 | if (b1[i] != b2[i]) { | ||
752 | printk("corruption in group %lu at byte %u(%u):" | ||
753 | " %x in copy != %x on disk/prealloc\n", | ||
754 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | ||
755 | BUG(); | ||
756 | } | ||
757 | } | ||
758 | } | ||
759 | } | ||
760 | |||
761 | #else | ||
762 | static inline void mb_free_blocks_double(struct inode *inode, | ||
763 | struct ext4_buddy *e4b, int first, int count) | ||
764 | { | ||
765 | return; | ||
766 | } | ||
767 | static inline void mb_mark_used_double(struct ext4_buddy *e4b, | ||
768 | int first, int count) | ||
769 | { | ||
770 | return; | ||
771 | } | ||
772 | static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | ||
773 | { | ||
774 | return; | ||
775 | } | ||
776 | #endif | ||
777 | |||
778 | #ifdef AGGRESSIVE_CHECK | ||
779 | |||
780 | #define MB_CHECK_ASSERT(assert) \ | ||
781 | do { \ | ||
782 | if (!(assert)) { \ | ||
783 | printk(KERN_EMERG \ | ||
784 | "Assertion failure in %s() at %s:%d: \"%s\"\n", \ | ||
785 | function, file, line, # assert); \ | ||
786 | BUG(); \ | ||
787 | } \ | ||
788 | } while (0) | ||
789 | |||
790 | static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, | ||
791 | const char *function, int line) | ||
792 | { | ||
793 | struct super_block *sb = e4b->bd_sb; | ||
794 | int order = e4b->bd_blkbits + 1; | ||
795 | int max; | ||
796 | int max2; | ||
797 | int i; | ||
798 | int j; | ||
799 | int k; | ||
800 | int count; | ||
801 | struct ext4_group_info *grp; | ||
802 | int fragments = 0; | ||
803 | int fstart; | ||
804 | struct list_head *cur; | ||
805 | void *buddy; | ||
806 | void *buddy2; | ||
807 | |||
808 | if (!test_opt(sb, MBALLOC)) | ||
809 | return 0; | ||
810 | |||
811 | { | ||
812 | static int mb_check_counter; | ||
813 | if (mb_check_counter++ % 100 != 0) | ||
814 | return 0; | ||
815 | } | ||
816 | |||
817 | while (order > 1) { | ||
818 | buddy = mb_find_buddy(e4b, order, &max); | ||
819 | MB_CHECK_ASSERT(buddy); | ||
820 | buddy2 = mb_find_buddy(e4b, order - 1, &max2); | ||
821 | MB_CHECK_ASSERT(buddy2); | ||
822 | MB_CHECK_ASSERT(buddy != buddy2); | ||
823 | MB_CHECK_ASSERT(max * 2 == max2); | ||
824 | |||
825 | count = 0; | ||
826 | for (i = 0; i < max; i++) { | ||
827 | |||
828 | if (mb_test_bit(i, buddy)) { | ||
829 | /* only single bit in buddy2 may be 1 */ | ||
830 | if (!mb_test_bit(i << 1, buddy2)) { | ||
831 | MB_CHECK_ASSERT( | ||
832 | mb_test_bit((i<<1)+1, buddy2)); | ||
833 | } else if (!mb_test_bit((i << 1) + 1, buddy2)) { | ||
834 | MB_CHECK_ASSERT( | ||
835 | mb_test_bit(i << 1, buddy2)); | ||
836 | } | ||
837 | continue; | ||
838 | } | ||
839 | |||
840 | /* both bits in buddy2 must be 0 */ | ||
841 | MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); | ||
842 | MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); | ||
843 | |||
844 | for (j = 0; j < (1 << order); j++) { | ||
845 | k = (i * (1 << order)) + j; | ||
846 | MB_CHECK_ASSERT( | ||
847 | !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); | ||
848 | } | ||
849 | count++; | ||
850 | } | ||
851 | MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); | ||
852 | order--; | ||
853 | } | ||
854 | |||
855 | fstart = -1; | ||
856 | buddy = mb_find_buddy(e4b, 0, &max); | ||
857 | for (i = 0; i < max; i++) { | ||
858 | if (!mb_test_bit(i, buddy)) { | ||
859 | MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); | ||
860 | if (fstart == -1) { | ||
861 | fragments++; | ||
862 | fstart = i; | ||
863 | } | ||
864 | continue; | ||
865 | } | ||
866 | fstart = -1; | ||
867 | /* check used bits only */ | ||
868 | for (j = 0; j < e4b->bd_blkbits + 1; j++) { | ||
869 | buddy2 = mb_find_buddy(e4b, j, &max2); | ||
870 | k = i >> j; | ||
871 | MB_CHECK_ASSERT(k < max2); | ||
872 | MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); | ||
873 | } | ||
874 | } | ||
875 | MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); | ||
876 | MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); | ||
877 | |||
878 | grp = ext4_get_group_info(sb, e4b->bd_group); | ||
879 | buddy = mb_find_buddy(e4b, 0, &max); | ||
880 | list_for_each(cur, &grp->bb_prealloc_list) { | ||
881 | ext4_group_t groupnr; | ||
882 | struct ext4_prealloc_space *pa; | ||
883 | pa = list_entry(cur, struct ext4_prealloc_space, group_list); | ||
884 | ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k); | ||
885 | MB_CHECK_ASSERT(groupnr == e4b->bd_group); | ||
886 | for (i = 0; i < pa->len; i++) | ||
887 | MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); | ||
888 | } | ||
889 | return 0; | ||
890 | } | ||
891 | #undef MB_CHECK_ASSERT | ||
892 | #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ | ||
893 | __FILE__, __FUNCTION__, __LINE__) | ||
894 | #else | ||
895 | #define mb_check_buddy(e4b) | ||
896 | #endif | ||
897 | |||
898 | /* FIXME!! need more doc */ | ||
899 | static void ext4_mb_mark_free_simple(struct super_block *sb, | ||
900 | void *buddy, unsigned first, int len, | ||
901 | struct ext4_group_info *grp) | ||
902 | { | ||
903 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
904 | unsigned short min; | ||
905 | unsigned short max; | ||
906 | unsigned short chunk; | ||
907 | unsigned short border; | ||
908 | |||
909 | BUG_ON(len >= EXT4_BLOCKS_PER_GROUP(sb)); | ||
910 | |||
911 | border = 2 << sb->s_blocksize_bits; | ||
912 | |||
913 | while (len > 0) { | ||
914 | /* find how many blocks can be covered from this position */ | ||
915 | max = ffs(first | border) - 1; | ||
916 | |||
917 | /* find how many blocks of power 2 we need to mark */ | ||
918 | min = fls(len) - 1; | ||
919 | |||
920 | if (max < min) | ||
921 | min = max; | ||
922 | chunk = 1 << min; | ||
923 | |||
924 | /* mark multiblock chunks only */ | ||
925 | grp->bb_counters[min]++; | ||
926 | if (min > 0) | ||
927 | mb_clear_bit(first >> min, | ||
928 | buddy + sbi->s_mb_offsets[min]); | ||
929 | |||
930 | len -= chunk; | ||
931 | first += chunk; | ||
932 | } | ||
933 | } | ||
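The ffs()/fls() interplay above splits a free range into maximal aligned power-of-two chunks. A userspace walk-through (editor's sketch substituting GCC builtins for the kernel's ffs/fls, with `border` as for 1K blocks) shows first=5, len=13 decomposing into chunks 1@5, 2@6, 8@8 and 2@16:

```c
#include <stdio.h>

static int ffs_(unsigned v) { return v ? __builtin_ffs(v) : 0; }
static int fls_(unsigned v) { return v ? 32 - __builtin_clz(v) : 0; }

int main(void)
{
	unsigned first = 5;
	int len = 13;
	/* border caps chunks at the largest buddy order; with 1K blocks
	 * this is 2 << 10, as in ext4_mb_mark_free_simple above */
	unsigned border = 2 << 10;

	while (len > 0) {
		int max = ffs_(first | border) - 1;  /* alignment of 'first' */
		int min = fls_(len) - 1;             /* largest 2^k <= len   */
		if (max < min)
			min = max;
		int chunk = 1 << min;
		printf("chunk of %d blocks at %u (order %d)\n",
		       chunk, first, min);
		len -= chunk;
		first += chunk;
	}
	return 0;
}
```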
934 | |||
935 | static void ext4_mb_generate_buddy(struct super_block *sb, | ||
936 | void *buddy, void *bitmap, ext4_group_t group) | ||
937 | { | ||
938 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
939 | unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); | ||
940 | unsigned short i = 0; | ||
941 | unsigned short first; | ||
942 | unsigned short len; | ||
943 | unsigned free = 0; | ||
944 | unsigned fragments = 0; | ||
945 | unsigned long long period = get_cycles(); | ||
946 | |||
947 | /* initialize buddy from the bitmap which is the aggregation | ||
948 | * of the on-disk bitmap and the preallocations */ | ||
949 | i = ext4_find_next_zero_bit(bitmap, max, 0); | ||
950 | grp->bb_first_free = i; | ||
951 | while (i < max) { | ||
952 | fragments++; | ||
953 | first = i; | ||
954 | i = ext4_find_next_bit(bitmap, max, i); | ||
955 | len = i - first; | ||
956 | free += len; | ||
957 | if (len > 1) | ||
958 | ext4_mb_mark_free_simple(sb, buddy, first, len, grp); | ||
959 | else | ||
960 | grp->bb_counters[0]++; | ||
961 | if (i < max) | ||
962 | i = ext4_find_next_zero_bit(bitmap, max, i); | ||
963 | } | ||
964 | grp->bb_fragments = fragments; | ||
965 | |||
966 | if (free != grp->bb_free) { | ||
967 | printk(KERN_DEBUG | ||
968 | "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", | ||
969 | group, free, grp->bb_free); | ||
970 | grp->bb_free = free; | ||
971 | } | ||
972 | |||
973 | clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); | ||
974 | |||
975 | period = get_cycles() - period; | ||
976 | spin_lock(&EXT4_SB(sb)->s_bal_lock); | ||
977 | EXT4_SB(sb)->s_mb_buddies_generated++; | ||
978 | EXT4_SB(sb)->s_mb_generation_time += period; | ||
979 | spin_unlock(&EXT4_SB(sb)->s_bal_lock); | ||
980 | } | ||
981 | |||
982 | /* The buddy information is attached to the buddy cache inode | ||
983 | * for convenience. The information regarding each group | ||
984 | * is loaded via ext4_mb_load_buddy. It involves the | ||
985 | * block bitmap and the buddy information, which are | ||
986 | * stored in the inode as | ||
987 | * | ||
988 | * { page } | ||
989 | * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... | ||
990 | * | ||
991 | * | ||
992 | * one block each for bitmap and buddy information. | ||
993 | * So for each group we take up 2 blocks. A page can | ||
994 | * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. | ||
995 | * So it can hold information regarding groups_per_page groups, | ||
996 | * which is blocks_per_page/2 | ||
997 | */ | ||
998 | |||
999 | static int ext4_mb_init_cache(struct page *page, char *incore) | ||
1000 | { | ||
1001 | int blocksize; | ||
1002 | int blocks_per_page; | ||
1003 | int groups_per_page; | ||
1004 | int err = 0; | ||
1005 | int i; | ||
1006 | ext4_group_t first_group; | ||
1007 | int first_block; | ||
1008 | struct super_block *sb; | ||
1009 | struct buffer_head *bhs; | ||
1010 | struct buffer_head **bh; | ||
1011 | struct inode *inode; | ||
1012 | char *data; | ||
1013 | char *bitmap; | ||
1014 | |||
1015 | mb_debug("init page %lu\n", page->index); | ||
1016 | |||
1017 | inode = page->mapping->host; | ||
1018 | sb = inode->i_sb; | ||
1019 | blocksize = 1 << inode->i_blkbits; | ||
1020 | blocks_per_page = PAGE_CACHE_SIZE / blocksize; | ||
1021 | |||
1022 | groups_per_page = blocks_per_page >> 1; | ||
1023 | if (groups_per_page == 0) | ||
1024 | groups_per_page = 1; | ||
1025 | |||
1026 | /* allocate buffer_heads to read bitmaps */ | ||
1027 | if (groups_per_page > 1) { | ||
1028 | err = -ENOMEM; | ||
1029 | i = sizeof(struct buffer_head *) * groups_per_page; | ||
1030 | bh = kzalloc(i, GFP_NOFS); | ||
1031 | if (bh == NULL) | ||
1032 | goto out; | ||
1033 | } else | ||
1034 | bh = &bhs; | ||
1035 | |||
1036 | first_group = page->index * blocks_per_page / 2; | ||
1037 | |||
1038 | /* read all groups the page covers into the cache */ | ||
1039 | for (i = 0; i < groups_per_page; i++) { | ||
1040 | struct ext4_group_desc *desc; | ||
1041 | |||
1042 | if (first_group + i >= EXT4_SB(sb)->s_groups_count) | ||
1043 | break; | ||
1044 | |||
1045 | err = -EIO; | ||
1046 | desc = ext4_get_group_desc(sb, first_group + i, NULL); | ||
1047 | if (desc == NULL) | ||
1048 | goto out; | ||
1049 | |||
1050 | err = -ENOMEM; | ||
1051 | bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); | ||
1052 | if (bh[i] == NULL) | ||
1053 | goto out; | ||
1054 | |||
1055 | if (bh_uptodate_or_lock(bh[i])) | ||
1056 | continue; | ||
1057 | |||
1058 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
1059 | ext4_init_block_bitmap(sb, bh[i], | ||
1060 | first_group + i, desc); | ||
1061 | set_buffer_uptodate(bh[i]); | ||
1062 | unlock_buffer(bh[i]); | ||
1063 | continue; | ||
1064 | } | ||
1065 | get_bh(bh[i]); | ||
1066 | bh[i]->b_end_io = end_buffer_read_sync; | ||
1067 | submit_bh(READ, bh[i]); | ||
1068 | mb_debug("read bitmap for group %lu\n", first_group + i); | ||
1069 | } | ||
1070 | |||
1071 | /* wait for I/O completion */ | ||
1072 | for (i = 0; i < groups_per_page && bh[i]; i++) | ||
1073 | wait_on_buffer(bh[i]); | ||
1074 | |||
1075 | err = -EIO; | ||
1076 | for (i = 0; i < groups_per_page && bh[i]; i++) | ||
1077 | if (!buffer_uptodate(bh[i])) | ||
1078 | goto out; | ||
1079 | |||
1080 | first_block = page->index * blocks_per_page; | ||
1081 | for (i = 0; i < blocks_per_page; i++) { | ||
1082 | int group; | ||
1083 | struct ext4_group_info *grinfo; | ||
1084 | |||
1085 | group = (first_block + i) >> 1; | ||
1086 | if (group >= EXT4_SB(sb)->s_groups_count) | ||
1087 | break; | ||
1088 | |||
1089 | /* | ||
1090 | * data carries information regarding this | ||
1091 | * particular group in the format specified | ||
1092 | * above | ||
1093 | * | ||
1094 | */ | ||
1095 | data = page_address(page) + (i * blocksize); | ||
1096 | bitmap = bh[group - first_group]->b_data; | ||
1097 | |||
1098 | /* | ||
1099 | * We place the buddy block and bitmap block | ||
1100 | * close together | ||
1101 | */ | ||
1102 | if ((first_block + i) & 1) { | ||
1103 | /* this is block of buddy */ | ||
1104 | BUG_ON(incore == NULL); | ||
1105 | mb_debug("put buddy for group %u in page %lu/%x\n", | ||
1106 | group, page->index, i * blocksize); | ||
1107 | memset(data, 0xff, blocksize); | ||
1108 | grinfo = ext4_get_group_info(sb, group); | ||
1109 | grinfo->bb_fragments = 0; | ||
1110 | memset(grinfo->bb_counters, 0, | ||
1111 | sizeof(unsigned short)*(sb->s_blocksize_bits+2)); | ||
1112 | /* | ||
1113 | * incore got set to the group block bitmap below | ||
1114 | */ | ||
1115 | ext4_mb_generate_buddy(sb, data, incore, group); | ||
1116 | incore = NULL; | ||
1117 | } else { | ||
1118 | /* this is block of bitmap */ | ||
1119 | BUG_ON(incore != NULL); | ||
1120 | mb_debug("put bitmap for group %u in page %lu/%x\n", | ||
1121 | group, page->index, i * blocksize); | ||
1122 | |||
1123 | /* see comments in ext4_mb_put_pa() */ | ||
1124 | ext4_lock_group(sb, group); | ||
1125 | memcpy(data, bitmap, blocksize); | ||
1126 | |||
1127 | /* mark all preallocated blks used in in-core bitmap */ | ||
1128 | ext4_mb_generate_from_pa(sb, data, group); | ||
1129 | ext4_unlock_group(sb, group); | ||
1130 | |||
1131 | /* set incore so that the buddy information can be | ||
1132 | * generated using this | ||
1133 | */ | ||
1134 | incore = data; | ||
1135 | } | ||
1136 | } | ||
1137 | SetPageUptodate(page); | ||
1138 | |||
1139 | out: | ||
1140 | if (bh) { | ||
1141 | for (i = 0; i < groups_per_page && bh[i]; i++) | ||
1142 | brelse(bh[i]); | ||
1143 | if (bh != &bhs) | ||
1144 | kfree(bh); | ||
1145 | } | ||
1146 | return err; | ||
1147 | } | ||
1148 | |||
1149 | static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | ||
1150 | struct ext4_buddy *e4b) | ||
1151 | { | ||
1152 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1153 | struct inode *inode = sbi->s_buddy_cache; | ||
1154 | int blocks_per_page; | ||
1155 | int block; | ||
1156 | int pnum; | ||
1157 | int poff; | ||
1158 | struct page *page; | ||
1159 | |||
1160 | mb_debug("load group %lu\n", group); | ||
1161 | |||
1162 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1163 | |||
1164 | e4b->bd_blkbits = sb->s_blocksize_bits; | ||
1165 | e4b->bd_info = ext4_get_group_info(sb, group); | ||
1166 | e4b->bd_sb = sb; | ||
1167 | e4b->bd_group = group; | ||
1168 | e4b->bd_buddy_page = NULL; | ||
1169 | e4b->bd_bitmap_page = NULL; | ||
1170 | |||
1171 | /* | ||
1172 | * the buddy cache inode stores the block bitmap | ||
1173 | * and buddy information in consecutive blocks. | ||
1174 | * So for each group we need two blocks. | ||
1175 | */ | ||
1176 | block = group * 2; | ||
1177 | pnum = block / blocks_per_page; | ||
1178 | poff = block % blocks_per_page; | ||
1179 | |||
1180 | /* we could use find_or_create_page(), but it locks the page, | ||
1181 | * which we'd like to avoid in the fast path ... */ | ||
1182 | page = find_get_page(inode->i_mapping, pnum); | ||
1183 | if (page == NULL || !PageUptodate(page)) { | ||
1184 | if (page) | ||
1185 | page_cache_release(page); | ||
1186 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1187 | if (page) { | ||
1188 | BUG_ON(page->mapping != inode->i_mapping); | ||
1189 | if (!PageUptodate(page)) { | ||
1190 | ext4_mb_init_cache(page, NULL); | ||
1191 | mb_cmp_bitmaps(e4b, page_address(page) + | ||
1192 | (poff * sb->s_blocksize)); | ||
1193 | } | ||
1194 | unlock_page(page); | ||
1195 | } | ||
1196 | } | ||
1197 | if (page == NULL || !PageUptodate(page)) | ||
1198 | goto err; | ||
1199 | e4b->bd_bitmap_page = page; | ||
1200 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
1201 | mark_page_accessed(page); | ||
1202 | |||
1203 | block++; | ||
1204 | pnum = block / blocks_per_page; | ||
1205 | poff = block % blocks_per_page; | ||
1206 | |||
1207 | page = find_get_page(inode->i_mapping, pnum); | ||
1208 | if (page == NULL || !PageUptodate(page)) { | ||
1209 | if (page) | ||
1210 | page_cache_release(page); | ||
1211 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1212 | if (page) { | ||
1213 | BUG_ON(page->mapping != inode->i_mapping); | ||
1214 | if (!PageUptodate(page)) | ||
1215 | ext4_mb_init_cache(page, e4b->bd_bitmap); | ||
1216 | |||
1217 | unlock_page(page); | ||
1218 | } | ||
1219 | } | ||
1220 | if (page == NULL || !PageUptodate(page)) | ||
1221 | goto err; | ||
1222 | e4b->bd_buddy_page = page; | ||
1223 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); | ||
1224 | mark_page_accessed(page); | ||
1225 | |||
1226 | BUG_ON(e4b->bd_bitmap_page == NULL); | ||
1227 | BUG_ON(e4b->bd_buddy_page == NULL); | ||
1228 | |||
1229 | return 0; | ||
1230 | |||
1231 | err: | ||
1232 | if (e4b->bd_bitmap_page) | ||
1233 | page_cache_release(e4b->bd_bitmap_page); | ||
1234 | if (e4b->bd_buddy_page) | ||
1235 | page_cache_release(e4b->bd_buddy_page); | ||
1236 | e4b->bd_buddy = NULL; | ||
1237 | e4b->bd_bitmap = NULL; | ||
1238 | return -EIO; | ||
1239 | } | ||
1240 | |||
1241 | static void ext4_mb_release_desc(struct ext4_buddy *e4b) | ||
1242 | { | ||
1243 | if (e4b->bd_bitmap_page) | ||
1244 | page_cache_release(e4b->bd_bitmap_page); | ||
1245 | if (e4b->bd_buddy_page) | ||
1246 | page_cache_release(e4b->bd_buddy_page); | ||
1247 | } | ||
1248 | |||
1249 | |||
1250 | static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) | ||
1251 | { | ||
1252 | int order = 1; | ||
1253 | void *bb; | ||
1254 | |||
1255 | BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); | ||
1256 | BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); | ||
1257 | |||
1258 | bb = EXT4_MB_BUDDY(e4b); | ||
1259 | while (order <= e4b->bd_blkbits + 1) { | ||
1260 | block = block >> 1; | ||
1261 | if (!mb_test_bit(block, bb)) { | ||
1262 | /* this block is part of buddy of order 'order' */ | ||
1263 | return order; | ||
1264 | } | ||
1265 | bb += 1 << (e4b->bd_blkbits - order); | ||
1266 | order++; | ||
1267 | } | ||
1268 | return 0; | ||
1269 | } | ||
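The `bb += 1 << (bd_blkbits - order)` step relies on the buddy block laying the per-order bitmaps out back to back: the order-k map holds 2^(blkbits+3-k) bits, i.e. 2^(blkbits-k) bytes. A sketch (editor's illustration for 4K blocks) of the byte offsets that walk visits:

```c
#include <stdio.h>

int main(void)
{
	int blkbits = 12;            /* 4K filesystem blocks */
	unsigned off = 0;            /* byte offset into the buddy block */

	/* mirrors 'bb += 1 << (bd_blkbits - order)' in
	 * mb_find_order_for_block: the order-k bitmap occupies
	 * 2^(blkbits - k) bytes, and the maps for k = 1, 2, ...
	 * are laid out back to back */
	for (int order = 1; order <= blkbits; order++) {
		printf("order %2d bitmap at byte offset %u\n", order, off);
		off += 1u << (blkbits - order);
	}
	return 0;
}
```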
1270 | |||
1271 | static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) | ||
1272 | { | ||
1273 | __u32 *addr; | ||
1274 | |||
1275 | len = cur + len; | ||
1276 | while (cur < len) { | ||
1277 | if ((cur & 31) == 0 && (len - cur) >= 32) { | ||
1278 | /* fast path: clear whole word at once */ | ||
1279 | addr = bm + (cur >> 3); | ||
1280 | *addr = 0; | ||
1281 | cur += 32; | ||
1282 | continue; | ||
1283 | } | ||
1284 | mb_clear_bit_atomic(lock, cur, bm); | ||
1285 | cur++; | ||
1286 | } | ||
1287 | } | ||
1288 | |||
1289 | static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | ||
1290 | { | ||
1291 | __u32 *addr; | ||
1292 | |||
1293 | len = cur + len; | ||
1294 | while (cur < len) { | ||
1295 | if ((cur & 31) == 0 && (len - cur) >= 32) { | ||
1296 | /* fast path: set whole word at once */ | ||
1297 | addr = bm + (cur >> 3); | ||
1298 | *addr = 0xffffffff; | ||
1299 | cur += 32; | ||
1300 | continue; | ||
1301 | } | ||
1302 | mb_set_bit_atomic(lock, cur, bm); | ||
1303 | cur++; | ||
1304 | } | ||
1305 | } | ||
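A userspace rendering (editor's sketch) of the same fast path used by mb_set_bits() and mb_clear_bits(): 32-bit-aligned spans are written a whole word at a time, the ragged edges bit by bit. Plain bit operations stand in for the kernel's ext4_set_bit_atomic, which runs under the group lock.

```c
#include <stdio.h>
#include <stdint.h>

static void set_bit_(void *bm, int nr)
{
	((uint8_t *) bm)[nr >> 3] |= 1u << (nr & 7);
}

static void set_bits(void *bm, int cur, int len)
{
	len = cur + len;
	while (cur < len) {
		if ((cur & 31) == 0 && (len - cur) >= 32) {
			/* fast path: set a whole 32-bit word at once */
			uint32_t *addr = (uint32_t *)((uint8_t *) bm + (cur >> 3));
			*addr = 0xffffffff;
			cur += 32;
			continue;
		}
		set_bit_(bm, cur);   /* slow path: one bit at a time */
		cur++;
	}
}

int main(void)
{
	uint32_t words[4] = {0};      /* 4-byte aligned backing store */
	uint8_t *bm = (uint8_t *) words;
	set_bits(bm, 5, 60);          /* set bits 5..64 */
	for (int i = 0; i < 16; i++)
		printf("%02x ", bm[i]);
	printf("\n");
	return 0;
}
```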
1306 | |||
1307 | static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | ||
1308 | int first, int count) | ||
1309 | { | ||
1310 | int block = 0; | ||
1311 | int max = 0; | ||
1312 | int order; | ||
1313 | void *buddy; | ||
1314 | void *buddy2; | ||
1315 | struct super_block *sb = e4b->bd_sb; | ||
1316 | |||
1317 | BUG_ON(first + count > (sb->s_blocksize << 3)); | ||
1318 | BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); | ||
1319 | mb_check_buddy(e4b); | ||
1320 | mb_free_blocks_double(inode, e4b, first, count); | ||
1321 | |||
1322 | e4b->bd_info->bb_free += count; | ||
1323 | if (first < e4b->bd_info->bb_first_free) | ||
1324 | e4b->bd_info->bb_first_free = first; | ||
1325 | |||
1326 | /* let's maintain fragments counter */ | ||
1327 | if (first != 0) | ||
1328 | block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); | ||
1329 | if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) | ||
1330 | max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); | ||
1331 | if (block && max) | ||
1332 | e4b->bd_info->bb_fragments--; | ||
1333 | else if (!block && !max) | ||
1334 | e4b->bd_info->bb_fragments++; | ||
1335 | |||
1336 | /* let's maintain buddy itself */ | ||
1337 | while (count-- > 0) { | ||
1338 | block = first++; | ||
1339 | order = 0; | ||
1340 | |||
1341 | if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { | ||
1342 | ext4_fsblk_t blocknr; | ||
1343 | blocknr = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb); | ||
1344 | blocknr += block; | ||
1345 | blocknr += | ||
1346 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | ||
1347 | |||
1348 | ext4_error(sb, __FUNCTION__, "double-free of inode" | ||
1349 | " %lu's block %llu(bit %u in group %lu)\n", | ||
1350 | inode ? inode->i_ino : 0, blocknr, block, | ||
1351 | e4b->bd_group); | ||
1352 | } | ||
1353 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); | ||
1354 | e4b->bd_info->bb_counters[order]++; | ||
1355 | |||
1356 | /* start of the buddy */ | ||
1357 | buddy = mb_find_buddy(e4b, order, &max); | ||
1358 | |||
1359 | do { | ||
1360 | block &= ~1UL; | ||
1361 | if (mb_test_bit(block, buddy) || | ||
1362 | mb_test_bit(block + 1, buddy)) | ||
1363 | break; | ||
1364 | |||
1365 | /* both the buddies are free, try to coalesce them */ | ||
1366 | buddy2 = mb_find_buddy(e4b, order + 1, &max); | ||
1367 | |||
1368 | if (!buddy2) | ||
1369 | break; | ||
1370 | |||
1371 | if (order > 0) { | ||
1372 | /* for special purposes, we don't set | ||
1373 | * free bits in bitmap */ | ||
1374 | mb_set_bit(block, buddy); | ||
1375 | mb_set_bit(block + 1, buddy); | ||
1376 | } | ||
1377 | e4b->bd_info->bb_counters[order]--; | ||
1378 | e4b->bd_info->bb_counters[order]--; | ||
1379 | |||
1380 | block = block >> 1; | ||
1381 | order++; | ||
1382 | e4b->bd_info->bb_counters[order]++; | ||
1383 | |||
1384 | mb_clear_bit(block, buddy2); | ||
1385 | buddy = buddy2; | ||
1386 | } while (1); | ||
1387 | } | ||
1388 | mb_check_buddy(e4b); | ||
1389 | |||
1390 | return 0; | ||
1391 | } | ||
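The do/while loop above is the classic buddy coalescing walk: when a freed chunk and its buddy (index ^ 1 at the same order) are both free, they merge into one chunk at the next order. A toy model (editor's sketch; note it tracks "free" as 1 where the mballoc buddy bitmaps track used bits) of the same walk:

```c
#include <stdio.h>

/* toy: is_free[order][idx] says whether the order-'order' chunk
 * starting at block idx << order is entirely free */
#define MAX_ORDER 4
static int is_free[MAX_ORDER + 1][1 << MAX_ORDER];

static void buddy_free(int block)
{
	int order = 0;
	is_free[0][block] = 1;
	/* same walk as the do/while loop in mb_free_blocks: while the
	 * buddy chunk is free too, merge and move one order up */
	while (order < MAX_ORDER && is_free[order][block ^ 1]) {
		is_free[order][block & ~1] = 0;   /* halves no longer   */
		is_free[order][block | 1] = 0;    /* tracked separately */
		block >>= 1;
		order++;
		is_free[order][block] = 1;
		printf("merged up to order %d, chunk %d\n", order, block);
	}
}

int main(void)
{
	buddy_free(6);   /* no neighbour free yet: stays at order 0  */
	buddy_free(7);   /* 6 and 7 merge -> order-1 chunk 3         */
	buddy_free(4);
	buddy_free(5);   /* 4+5 -> order-1 chunk 2; then -> order 2  */
	return 0;
}
```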
1392 | |||
1393 | static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, | ||
1394 | int needed, struct ext4_free_extent *ex) | ||
1395 | { | ||
1396 | int next = block; | ||
1397 | int max; | ||
1398 | int ord; | ||
1399 | void *buddy; | ||
1400 | |||
1401 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | ||
1402 | BUG_ON(ex == NULL); | ||
1403 | |||
1404 | buddy = mb_find_buddy(e4b, order, &max); | ||
1405 | BUG_ON(buddy == NULL); | ||
1406 | BUG_ON(block >= max); | ||
1407 | if (mb_test_bit(block, buddy)) { | ||
1408 | ex->fe_len = 0; | ||
1409 | ex->fe_start = 0; | ||
1410 | ex->fe_group = 0; | ||
1411 | return 0; | ||
1412 | } | ||
1413 | |||
1414 | /* FIXME drop order completely? */ | ||
1415 | if (likely(order == 0)) { | ||
1416 | /* find actual order */ | ||
1417 | order = mb_find_order_for_block(e4b, block); | ||
1418 | block = block >> order; | ||
1419 | } | ||
1420 | |||
1421 | ex->fe_len = 1 << order; | ||
1422 | ex->fe_start = block << order; | ||
1423 | ex->fe_group = e4b->bd_group; | ||
1424 | |||
1425 | /* calc difference from given start */ | ||
1426 | next = next - ex->fe_start; | ||
1427 | ex->fe_len -= next; | ||
1428 | ex->fe_start += next; | ||
1429 | |||
1430 | while (needed > ex->fe_len && | ||
1431 | (buddy = mb_find_buddy(e4b, order, &max))) { | ||
1432 | |||
1433 | if (block + 1 >= max) | ||
1434 | break; | ||
1435 | |||
1436 | next = (block + 1) * (1 << order); | ||
1437 | if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) | ||
1438 | break; | ||
1439 | |||
1440 | ord = mb_find_order_for_block(e4b, next); | ||
1441 | |||
1442 | order = ord; | ||
1443 | block = next >> order; | ||
1444 | ex->fe_len += 1 << order; | ||
1445 | } | ||
1446 | |||
1447 | BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); | ||
1448 | return ex->fe_len; | ||
1449 | } | ||
1450 | |||
1451 | static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | ||
1452 | { | ||
1453 | int ord; | ||
1454 | int mlen = 0; | ||
1455 | int max = 0; | ||
1456 | int cur; | ||
1457 | int start = ex->fe_start; | ||
1458 | int len = ex->fe_len; | ||
1459 | unsigned ret = 0; | ||
1460 | int len0 = len; | ||
1461 | void *buddy; | ||
1462 | |||
1463 | BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); | ||
1464 | BUG_ON(e4b->bd_group != ex->fe_group); | ||
1465 | BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); | ||
1466 | mb_check_buddy(e4b); | ||
1467 | mb_mark_used_double(e4b, start, len); | ||
1468 | |||
1469 | e4b->bd_info->bb_free -= len; | ||
1470 | if (e4b->bd_info->bb_first_free == start) | ||
1471 | e4b->bd_info->bb_first_free += len; | ||
1472 | |||
1473 | /* let's maintain fragments counter */ | ||
1474 | if (start != 0) | ||
1475 | mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); | ||
1476 | if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) | ||
1477 | max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); | ||
1478 | if (mlen && max) | ||
1479 | e4b->bd_info->bb_fragments++; | ||
1480 | else if (!mlen && !max) | ||
1481 | e4b->bd_info->bb_fragments--; | ||
1482 | |||
1483 | /* let's maintain buddy itself */ | ||
1484 | while (len) { | ||
1485 | ord = mb_find_order_for_block(e4b, start); | ||
1486 | |||
1487 | if (((start >> ord) << ord) == start && len >= (1 << ord)) { | ||
1488 | /* the whole chunk may be allocated at once! */ | ||
1489 | mlen = 1 << ord; | ||
1490 | buddy = mb_find_buddy(e4b, ord, &max); | ||
1491 | BUG_ON((start >> ord) >= max); | ||
1492 | mb_set_bit(start >> ord, buddy); | ||
1493 | e4b->bd_info->bb_counters[ord]--; | ||
1494 | start += mlen; | ||
1495 | len -= mlen; | ||
1496 | BUG_ON(len < 0); | ||
1497 | continue; | ||
1498 | } | ||
1499 | |||
1500 | /* store for history */ | ||
1501 | if (ret == 0) | ||
1502 | ret = len | (ord << 16); | ||
1503 | |||
1504 | /* we have to split large buddy */ | ||
1505 | BUG_ON(ord <= 0); | ||
1506 | buddy = mb_find_buddy(e4b, ord, &max); | ||
1507 | mb_set_bit(start >> ord, buddy); | ||
1508 | e4b->bd_info->bb_counters[ord]--; | ||
1509 | |||
1510 | ord--; | ||
1511 | cur = (start >> ord) & ~1U; | ||
1512 | buddy = mb_find_buddy(e4b, ord, &max); | ||
1513 | mb_clear_bit(cur, buddy); | ||
1514 | mb_clear_bit(cur + 1, buddy); | ||
1515 | e4b->bd_info->bb_counters[ord]++; | ||
1516 | e4b->bd_info->bb_counters[ord]++; | ||
1517 | } | ||
1518 | |||
1519 | mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), | ||
1520 | EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | ||
1521 | mb_check_buddy(e4b); | ||
1522 | |||
1523 | return ret; | ||
1524 | } | ||
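| |||
| /* | ||
|  * Worked example for the split path above (hypothetical numbers): | ||
|  * marking 3 blocks used at start 8 inside a free order-2 chunk | ||
|  * [8-11] splits it into the order-1 buddies [8-9] and [10-11], | ||
|  * takes [8-9] whole, then splits [10-11] into single blocks and | ||
|  * takes block 10, leaving block 11 free; the first split is what | ||
|  * gets recorded in 'ret' for the allocation history. | ||
|  */ | ||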
1525 | |||
1526 | /* | ||
1527 | * Must be called under group lock! | ||
1528 | */ | ||
1529 | static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, | ||
1530 | struct ext4_buddy *e4b) | ||
1531 | { | ||
1532 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
1533 | int ret; | ||
1534 | |||
1535 | BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); | ||
1536 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); | ||
1537 | |||
1538 | ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); | ||
1539 | ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; | ||
1540 | ret = mb_mark_used(e4b, &ac->ac_b_ex); | ||
1541 | |||
1542 | /* preallocation can change ac_b_ex, thus we store actually | ||
1543 | * allocated blocks for history */ | ||
1544 | ac->ac_f_ex = ac->ac_b_ex; | ||
1545 | |||
1546 | ac->ac_status = AC_STATUS_FOUND; | ||
1547 | ac->ac_tail = ret & 0xffff; | ||
1548 | ac->ac_buddy = ret >> 16; | ||
1549 | |||
1550 | /* hold extra references on the bitmap and buddy pages so they | ||
1551 |  * are not reclaimed before this allocation context is done */ | ||
1552 | ac->ac_bitmap_page = e4b->bd_bitmap_page; | ||
1553 | get_page(ac->ac_bitmap_page); | ||
1554 | ac->ac_buddy_page = e4b->bd_buddy_page; | ||
1555 | get_page(ac->ac_buddy_page); | ||
1556 | |||
1557 | /* store last allocated for subsequent stream allocation */ | ||
1558 | if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { | ||
1559 | spin_lock(&sbi->s_md_lock); | ||
1560 | sbi->s_mb_last_group = ac->ac_f_ex.fe_group; | ||
1561 | sbi->s_mb_last_start = ac->ac_f_ex.fe_start; | ||
1562 | spin_unlock(&sbi->s_md_lock); | ||
1563 | } | ||
1564 | } | ||
1565 | |||
1566 | /* | ||
1567 | * regular allocator, for general purposes allocation | ||
1568 | */ | ||
1569 | |||
1570 | static void ext4_mb_check_limits(struct ext4_allocation_context *ac, | ||
1571 | struct ext4_buddy *e4b, | ||
1572 | int finish_group) | ||
1573 | { | ||
1574 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
1575 | struct ext4_free_extent *bex = &ac->ac_b_ex; | ||
1576 | struct ext4_free_extent *gex = &ac->ac_g_ex; | ||
1577 | struct ext4_free_extent ex; | ||
1578 | int max; | ||
1579 | |||
1580 | /* | ||
1581 | * We don't want to scan for a whole year | ||
1582 | */ | ||
1583 | if (ac->ac_found > sbi->s_mb_max_to_scan && | ||
1584 | !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { | ||
1585 | ac->ac_status = AC_STATUS_BREAK; | ||
1586 | return; | ||
1587 | } | ||
1588 | |||
1589 | /* | ||
1590 | * Haven't found good chunk so far, let's continue | ||
1591 | */ | ||
1592 | if (bex->fe_len < gex->fe_len) | ||
1593 | return; | ||
1594 | |||
1595 | if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) | ||
1596 | && bex->fe_group == e4b->bd_group) { | ||
1597 | /* recheck the chunk's availability - we don't know | ||
1598 | * whether it was found before or after the group | ||
1599 | * lock was last dropped and retaken */ | ||
1600 | max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); | ||
1601 | if (max >= gex->fe_len) { | ||
1602 | ext4_mb_use_best_found(ac, e4b); | ||
1603 | return; | ||
1604 | } | ||
1605 | } | ||
1606 | } | ||
1607 | |||
1608 | /* | ||
1609 | * The routine checks whether found extent is good enough. If it is, | ||
1610 | * then the extent gets marked used and flag is set to the context | ||
1611 | * to stop scanning. Otherwise, the extent is compared with the | ||
1612 | * previous found extent and if new one is better, then it's stored | ||
1613 | * in the context. Later, the best found extent will be used, if | ||
1614 | * mballoc can't find good enough extent. | ||
1615 | * | ||
1616 | * FIXME: real allocation policy is to be designed yet! | ||
1617 | */ | ||
1618 | static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, | ||
1619 | struct ext4_free_extent *ex, | ||
1620 | struct ext4_buddy *e4b) | ||
1621 | { | ||
1622 | struct ext4_free_extent *bex = &ac->ac_b_ex; | ||
1623 | struct ext4_free_extent *gex = &ac->ac_g_ex; | ||
1624 | |||
1625 | BUG_ON(ex->fe_len <= 0); | ||
1626 | BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); | ||
1627 | BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); | ||
1628 | BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); | ||
1629 | |||
1630 | ac->ac_found++; | ||
1631 | |||
1632 | /* | ||
1633 | * The special case - take what you catch first | ||
1634 | */ | ||
1635 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { | ||
1636 | *bex = *ex; | ||
1637 | ext4_mb_use_best_found(ac, e4b); | ||
1638 | return; | ||
1639 | } | ||
1640 | |||
1641 | /* | ||
1642 | * Let's check whether the chunk is good enough | ||
1643 | */ | ||
1644 | if (ex->fe_len == gex->fe_len) { | ||
1645 | *bex = *ex; | ||
1646 | ext4_mb_use_best_found(ac, e4b); | ||
1647 | return; | ||
1648 | } | ||
1649 | |||
1650 | /* | ||
1651 | * If this is first found extent, just store it in the context | ||
1652 | */ | ||
1653 | if (bex->fe_len == 0) { | ||
1654 | *bex = *ex; | ||
1655 | return; | ||
1656 | } | ||
1657 | |||
1658 | /* | ||
1659 | * If new found extent is better, store it in the context | ||
1660 | */ | ||
1661 | if (bex->fe_len < gex->fe_len) { | ||
1662 | /* if the request isn't satisfied, any found extent | ||
1663 | * larger than previous best one is better */ | ||
1664 | if (ex->fe_len > bex->fe_len) | ||
1665 | *bex = *ex; | ||
1666 | } else if (ex->fe_len > gex->fe_len) { | ||
1667 | /* if the request is satisfied, then we try to find | ||
1668 | * an extent that still satisfies the request, but is | ||
1669 | * smaller than the previous one */ | ||
1670 | if (ex->fe_len < bex->fe_len) | ||
1671 | *bex = *ex; | ||
1672 | } | ||
1673 | |||
1674 | ext4_mb_check_limits(ac, e4b, 0); | ||
1675 | } | ||
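| |||
| /* | ||
|  * Example of the policy above (hypothetical numbers): with a goal of | ||
|  * 8 blocks, a first found extent of 6 is stored; a later 13 replaces | ||
|  * it (the request is now satisfied); a later 9 replaces the 13 | ||
|  * (still satisfies, wastes less); an exact 8 would be used at once. | ||
|  */ | ||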
1676 | |||
1677 | static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, | ||
1678 | struct ext4_buddy *e4b) | ||
1679 | { | ||
1680 | struct ext4_free_extent ex = ac->ac_b_ex; | ||
1681 | ext4_group_t group = ex.fe_group; | ||
1682 | int max; | ||
1683 | int err; | ||
1684 | |||
1685 | BUG_ON(ex.fe_len <= 0); | ||
1686 | err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); | ||
1687 | if (err) | ||
1688 | return err; | ||
1689 | |||
1690 | ext4_lock_group(ac->ac_sb, group); | ||
1691 | max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); | ||
1692 | |||
1693 | if (max > 0) { | ||
1694 | ac->ac_b_ex = ex; | ||
1695 | ext4_mb_use_best_found(ac, e4b); | ||
1696 | } | ||
1697 | |||
1698 | ext4_unlock_group(ac->ac_sb, group); | ||
1699 | ext4_mb_release_desc(e4b); | ||
1700 | |||
1701 | return 0; | ||
1702 | } | ||
1703 | |||
1704 | static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, | ||
1705 | struct ext4_buddy *e4b) | ||
1706 | { | ||
1707 | ext4_group_t group = ac->ac_g_ex.fe_group; | ||
1708 | int max; | ||
1709 | int err; | ||
1710 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
1711 | struct ext4_super_block *es = sbi->s_es; | ||
1712 | struct ext4_free_extent ex; | ||
1713 | |||
1714 | if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) | ||
1715 | return 0; | ||
1716 | |||
1717 | err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); | ||
1718 | if (err) | ||
1719 | return err; | ||
1720 | |||
1721 | ext4_lock_group(ac->ac_sb, group); | ||
1722 | max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, | ||
1723 | ac->ac_g_ex.fe_len, &ex); | ||
1724 | |||
1725 | if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { | ||
1726 | ext4_fsblk_t start; | ||
1727 | |||
1728 | start = (e4b->bd_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) + | ||
1729 | ex.fe_start + le32_to_cpu(es->s_first_data_block); | ||
1730 | /* use do_div to get remainder (would be 64-bit modulo) */ | ||
1731 | if (do_div(start, sbi->s_stripe) == 0) { | ||
1732 | ac->ac_found++; | ||
1733 | ac->ac_b_ex = ex; | ||
1734 | ext4_mb_use_best_found(ac, e4b); | ||
1735 | } | ||
1736 | } else if (max >= ac->ac_g_ex.fe_len) { | ||
1737 | BUG_ON(ex.fe_len <= 0); | ||
1738 | BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); | ||
1739 | BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); | ||
1740 | ac->ac_found++; | ||
1741 | ac->ac_b_ex = ex; | ||
1742 | ext4_mb_use_best_found(ac, e4b); | ||
1743 | } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { | ||
1744 | /* Sometimes, caller may want to merge even small | ||
1745 | * number of blocks to an existing extent */ | ||
1746 | BUG_ON(ex.fe_len <= 0); | ||
1747 | BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); | ||
1748 | BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); | ||
1749 | ac->ac_found++; | ||
1750 | ac->ac_b_ex = ex; | ||
1751 | ext4_mb_use_best_found(ac, e4b); | ||
1752 | } | ||
1753 | ext4_unlock_group(ac->ac_sb, group); | ||
1754 | ext4_mb_release_desc(e4b); | ||
1755 | |||
1756 | return 0; | ||
1757 | } | ||
1758 | |||
1759 | /* | ||
1760 | * The routine scans buddy structures (not bitmap!) from given order | ||
1761 | * to max order and tries to find a chunk big enough to satisfy the request | ||
1762 | */ | ||
1763 | static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, | ||
1764 | struct ext4_buddy *e4b) | ||
1765 | { | ||
1766 | struct super_block *sb = ac->ac_sb; | ||
1767 | struct ext4_group_info *grp = e4b->bd_info; | ||
1768 | void *buddy; | ||
1769 | int i; | ||
1770 | int k; | ||
1771 | int max; | ||
1772 | |||
1773 | BUG_ON(ac->ac_2order <= 0); | ||
1774 | for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { | ||
1775 | if (grp->bb_counters[i] == 0) | ||
1776 | continue; | ||
1777 | |||
1778 | buddy = mb_find_buddy(e4b, i, &max); | ||
1779 | BUG_ON(buddy == NULL); | ||
1780 | |||
1781 | k = ext4_find_next_zero_bit(buddy, max, 0); | ||
1782 | BUG_ON(k >= max); | ||
1783 | |||
1784 | ac->ac_found++; | ||
1785 | |||
1786 | ac->ac_b_ex.fe_len = 1 << i; | ||
1787 | ac->ac_b_ex.fe_start = k << i; | ||
1788 | ac->ac_b_ex.fe_group = e4b->bd_group; | ||
1789 | |||
1790 | ext4_mb_use_best_found(ac, e4b); | ||
1791 | |||
1792 | BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); | ||
1793 | |||
1794 | if (EXT4_SB(sb)->s_mb_stats) | ||
1795 | atomic_inc(&EXT4_SB(sb)->s_bal_2orders); | ||
1796 | |||
1797 | break; | ||
1798 | } | ||
1799 | } | ||
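| |||
| /* | ||
|  * Example (hypothetical numbers): for a 2^4 = 16 block request the | ||
|  * loop above checks bb_counters[4], bb_counters[5], ...; the first | ||
|  * non-zero counter yields a free chunk with a single | ||
|  * ext4_find_next_zero_bit() call on that order's buddy bitmap, and | ||
|  * ext4_mb_use_best_found() trims any larger chunk down to the goal. | ||
|  */ | ||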
1800 | |||
1801 | /* | ||
1802 | * The routine scans the group and measures all found extents. | ||
1803 | * In order to optimize scanning, caller must pass number of | ||
1804 | * free blocks in the group, so the routine can know upper limit. | ||
1805 | */ | ||
1806 | static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, | ||
1807 | struct ext4_buddy *e4b) | ||
1808 | { | ||
1809 | struct super_block *sb = ac->ac_sb; | ||
1810 | void *bitmap = EXT4_MB_BITMAP(e4b); | ||
1811 | struct ext4_free_extent ex; | ||
1812 | int i; | ||
1813 | int free; | ||
1814 | |||
1815 | free = e4b->bd_info->bb_free; | ||
1816 | BUG_ON(free <= 0); | ||
1817 | |||
1818 | i = e4b->bd_info->bb_first_free; | ||
1819 | |||
1820 | while (free && ac->ac_status == AC_STATUS_CONTINUE) { | ||
1821 | i = ext4_find_next_zero_bit(bitmap, | ||
1822 | EXT4_BLOCKS_PER_GROUP(sb), i); | ||
1823 | if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { | ||
1824 | BUG_ON(free != 0); | ||
1825 | break; | ||
1826 | } | ||
1827 | |||
1828 | mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); | ||
1829 | BUG_ON(ex.fe_len <= 0); | ||
1830 | BUG_ON(free < ex.fe_len); | ||
1831 | |||
1832 | ext4_mb_measure_extent(ac, &ex, e4b); | ||
1833 | |||
1834 | i += ex.fe_len; | ||
1835 | free -= ex.fe_len; | ||
1836 | } | ||
1837 | |||
1838 | ext4_mb_check_limits(ac, e4b, 1); | ||
1839 | } | ||
1840 | |||
1841 | /* | ||
1842 | * This is a special case for storages like raid5 | ||
1843 | * we try to find stripe-aligned chunks for stripe-size requests | ||
1844 | * XXX should do so at least for multiples of stripe size as well | ||
1845 | */ | ||
1846 | static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, | ||
1847 | struct ext4_buddy *e4b) | ||
1848 | { | ||
1849 | struct super_block *sb = ac->ac_sb; | ||
1850 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1851 | void *bitmap = EXT4_MB_BITMAP(e4b); | ||
1852 | struct ext4_free_extent ex; | ||
1853 | ext4_fsblk_t first_group_block; | ||
1854 | ext4_fsblk_t a; | ||
1855 | ext4_grpblk_t i; | ||
1856 | int max; | ||
1857 | |||
1858 | BUG_ON(sbi->s_stripe == 0); | ||
1859 | |||
1860 | /* find first stripe-aligned block in group */ | ||
1861 | first_group_block = e4b->bd_group * EXT4_BLOCKS_PER_GROUP(sb) | ||
1862 | + le32_to_cpu(sbi->s_es->s_first_data_block); | ||
1863 | a = first_group_block + sbi->s_stripe - 1; | ||
1864 | do_div(a, sbi->s_stripe); | ||
1865 | i = (a * sbi->s_stripe) - first_group_block; | ||
1866 | |||
1867 | while (i < EXT4_BLOCKS_PER_GROUP(sb)) { | ||
1868 | if (!mb_test_bit(i, bitmap)) { | ||
1869 | max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); | ||
1870 | if (max >= sbi->s_stripe) { | ||
1871 | ac->ac_found++; | ||
1872 | ac->ac_b_ex = ex; | ||
1873 | ext4_mb_use_best_found(ac, e4b); | ||
1874 | break; | ||
1875 | } | ||
1876 | } | ||
1877 | i += sbi->s_stripe; | ||
1878 | } | ||
1879 | } | ||
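| |||
| /* | ||
|  * Example of the alignment math above (hypothetical numbers): with | ||
|  * s_stripe = 16 and a group whose first block is 32769, 'a' rounds | ||
|  * up to 32784, so the scan starts at group-relative block 15 and | ||
|  * advances in steps of 16, probing only stripe-aligned positions. | ||
|  */ | ||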
1880 | |||
1881 | static int ext4_mb_good_group(struct ext4_allocation_context *ac, | ||
1882 | ext4_group_t group, int cr) | ||
1883 | { | ||
1884 | unsigned free, fragments; | ||
1885 | unsigned i, bits; | ||
1886 | struct ext4_group_desc *desc; | ||
1887 | struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); | ||
1888 | |||
1889 | BUG_ON(cr < 0 || cr >= 4); | ||
1890 | BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); | ||
1891 | |||
1892 | free = grp->bb_free; | ||
1893 | fragments = grp->bb_fragments; | ||
1894 | if (free == 0) | ||
1895 | return 0; | ||
1896 | if (fragments == 0) | ||
1897 | return 0; | ||
1898 | |||
1899 | switch (cr) { | ||
1900 | case 0: | ||
1901 | BUG_ON(ac->ac_2order == 0); | ||
1902 | /* If this group is uninitialized, skip it initially */ | ||
1903 | desc = ext4_get_group_desc(ac->ac_sb, group, NULL); | ||
1904 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) | ||
1905 | return 0; | ||
1906 | |||
1907 | bits = ac->ac_sb->s_blocksize_bits + 1; | ||
1908 | for (i = ac->ac_2order; i <= bits; i++) | ||
1909 | if (grp->bb_counters[i] > 0) | ||
1910 | return 1; | ||
1911 | break; | ||
1912 | case 1: | ||
1913 | if ((free / fragments) >= ac->ac_g_ex.fe_len) | ||
1914 | return 1; | ||
1915 | break; | ||
1916 | case 2: | ||
1917 | if (free >= ac->ac_g_ex.fe_len) | ||
1918 | return 1; | ||
1919 | break; | ||
1920 | case 3: | ||
1921 | return 1; | ||
1922 | default: | ||
1923 | BUG(); | ||
1924 | } | ||
1925 | |||
1926 | return 0; | ||
1927 | } | ||
1928 | |||
1929 | static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | ||
1930 | { | ||
1931 | ext4_group_t group; | ||
1932 | ext4_group_t i; | ||
1933 | int cr; | ||
1934 | int err = 0; | ||
1935 | int bsbits; | ||
1936 | struct ext4_sb_info *sbi; | ||
1937 | struct super_block *sb; | ||
1938 | struct ext4_buddy e4b; | ||
1939 | loff_t size, isize; | ||
1940 | |||
1941 | sb = ac->ac_sb; | ||
1942 | sbi = EXT4_SB(sb); | ||
1943 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); | ||
1944 | |||
1945 | /* first, try the goal */ | ||
1946 | err = ext4_mb_find_by_goal(ac, &e4b); | ||
1947 | if (err || ac->ac_status == AC_STATUS_FOUND) | ||
1948 | goto out; | ||
1949 | |||
1950 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
1951 | goto out; | ||
1952 | |||
1953 | /* | ||
1954 | * ac->ac_2order is set only if fe_len is a power of 2; | ||
1955 | * when it is set we also start with criteria 0 so that | ||
1956 | * we try an exact allocation using the buddy data. | ||
1957 | */ | ||
1958 | i = fls(ac->ac_g_ex.fe_len); | ||
1959 | ac->ac_2order = 0; | ||
1960 | /* | ||
1961 | * We search using buddy data only if the order of the request | ||
1962 | * is greater than or equal to sbi->s_mb_order2_reqs. | ||
1963 | * You can tune it via /proc/fs/ext4/<partition>/order2_req | ||
1964 | */ | ||
1965 | if (i >= sbi->s_mb_order2_reqs) { | ||
1966 | /* | ||
1967 | * This should tell if fe_len is exactly power of 2 | ||
1968 | */ | ||
1969 | if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) | ||
1970 | ac->ac_2order = i - 1; | ||
1971 | } | ||
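| |||
| /* | ||
|  * Example (hypothetical numbers): for fe_len = 16, fls() returns 5 | ||
|  * and 16 & ~(1 << 4) == 0, so ac_2order becomes 4; fe_len = 24 | ||
|  * fails the test and is left to the non-power-of-two criteria. | ||
|  */ | ||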
1972 | |||
1973 | bsbits = ac->ac_sb->s_blocksize_bits; | ||
1974 | /* if stream allocation is enabled, use global goal */ | ||
1975 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
1976 | isize = i_size_read(ac->ac_inode) >> bsbits; | ||
1977 | if (size < isize) | ||
1978 | size = isize; | ||
1979 | |||
1980 | if (size < sbi->s_mb_stream_request && | ||
1981 | (ac->ac_flags & EXT4_MB_HINT_DATA)) { | ||
1982 | /* TBD: may be hot point */ | ||
1983 | spin_lock(&sbi->s_md_lock); | ||
1984 | ac->ac_g_ex.fe_group = sbi->s_mb_last_group; | ||
1985 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; | ||
1986 | spin_unlock(&sbi->s_md_lock); | ||
1987 | } | ||
1988 | |||
1989 | /* search for the right group, starting from the goal value specified */ | ||
1990 | group = ac->ac_g_ex.fe_group; | ||
1991 | |||
1992 | /* Let's just scan groups to find more or less suitable blocks */ | ||
1993 | cr = ac->ac_2order ? 0 : 1; | ||
1994 | /* | ||
1995 | * cr == 0: only groups with a free 2^ac_2order chunk, | ||
| * cr == 1: groups whose average fragment size can fit the request, | ||
| * cr == 2: any group with enough free blocks, | ||
1996 | * cr == 3: try to get anything | ||
1997 | */ | ||
1998 | repeat: | ||
1999 | for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { | ||
2000 | ac->ac_criteria = cr; | ||
2001 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { | ||
2002 | struct ext4_group_info *grp; | ||
2003 | struct ext4_group_desc *desc; | ||
2004 | |||
2005 | if (group == EXT4_SB(sb)->s_groups_count) | ||
2006 | group = 0; | ||
2007 | |||
2008 | /* quick check to skip empty groups */ | ||
2009 | grp = ext4_get_group_info(ac->ac_sb, group); | ||
2010 | if (grp->bb_free == 0) | ||
2011 | continue; | ||
2012 | |||
2013 | /* | ||
2014 | * if the group is already initialized, check whether it | ||
2015 | * is a good group first, so we don't load the buddy in vain | ||
2016 | */ | ||
2017 | if (EXT4_MB_GRP_NEED_INIT(grp)) { | ||
2018 | /* | ||
2019 | * we need full data about the group | ||
2020 | * to make a good selection | ||
2021 | */ | ||
2022 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
2023 | if (err) | ||
2024 | goto out; | ||
2025 | ext4_mb_release_desc(&e4b); | ||
2026 | } | ||
2027 | |||
2028 | /* | ||
2029 | * If the particular group doesn't satisfy our | ||
2030 | * criteria we continue with the next group | ||
2031 | */ | ||
2032 | if (!ext4_mb_good_group(ac, group, cr)) | ||
2033 | continue; | ||
2034 | |||
2035 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
2036 | if (err) | ||
2037 | goto out; | ||
2038 | |||
2039 | ext4_lock_group(sb, group); | ||
2040 | if (!ext4_mb_good_group(ac, group, cr)) { | ||
2041 | /* someone did allocation from this group */ | ||
2042 | ext4_unlock_group(sb, group); | ||
2043 | ext4_mb_release_desc(&e4b); | ||
2044 | continue; | ||
2045 | } | ||
2046 | |||
2047 | ac->ac_groups_scanned++; | ||
2048 | desc = ext4_get_group_desc(sb, group, NULL); | ||
2049 | if (cr == 0 || ((desc->bg_flags & | ||
2050 | cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) && | ||
2051 | ac->ac_2order != 0)) | ||
2052 | ext4_mb_simple_scan_group(ac, &e4b); | ||
2053 | else if (cr == 1 && | ||
2054 | ac->ac_g_ex.fe_len == sbi->s_stripe) | ||
2055 | ext4_mb_scan_aligned(ac, &e4b); | ||
2056 | else | ||
2057 | ext4_mb_complex_scan_group(ac, &e4b); | ||
2058 | |||
2059 | ext4_unlock_group(sb, group); | ||
2060 | ext4_mb_release_desc(&e4b); | ||
2061 | |||
2062 | if (ac->ac_status != AC_STATUS_CONTINUE) | ||
2063 | break; | ||
2064 | } | ||
2065 | } | ||
2066 | |||
2067 | if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && | ||
2068 | !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { | ||
2069 | /* | ||
2070 | * We've been searching too long. Let's try to allocate | ||
2071 | * the best chunk we've found so far | ||
2072 | */ | ||
2073 | |||
2074 | ext4_mb_try_best_found(ac, &e4b); | ||
2075 | if (ac->ac_status != AC_STATUS_FOUND) { | ||
2076 | /* | ||
2077 | * Someone more lucky has already allocated it. | ||
2078 | * The only thing we can do is just take first | ||
2079 | * found block(s) | ||
2080 | printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); | ||
2081 | */ | ||
2082 | ac->ac_b_ex.fe_group = 0; | ||
2083 | ac->ac_b_ex.fe_start = 0; | ||
2084 | ac->ac_b_ex.fe_len = 0; | ||
2085 | ac->ac_status = AC_STATUS_CONTINUE; | ||
2086 | ac->ac_flags |= EXT4_MB_HINT_FIRST; | ||
2087 | cr = 3; | ||
2088 | atomic_inc(&sbi->s_mb_lost_chunks); | ||
2089 | goto repeat; | ||
2090 | } | ||
2091 | } | ||
2092 | out: | ||
2093 | return err; | ||
2094 | } | ||
2095 | |||
2096 | #ifdef EXT4_MB_HISTORY | ||
2097 | struct ext4_mb_proc_session { | ||
2098 | struct ext4_mb_history *history; | ||
2099 | struct super_block *sb; | ||
2100 | int start; | ||
2101 | int max; | ||
2102 | }; | ||
2103 | |||
2104 | static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, | ||
2105 | struct ext4_mb_history *hs, | ||
2106 | int first) | ||
2107 | { | ||
2108 | if (hs == s->history + s->max) | ||
2109 | hs = s->history; | ||
2110 | if (!first && hs == s->history + s->start) | ||
2111 | return NULL; | ||
2112 | while (hs->orig.fe_len == 0) { | ||
2113 | hs++; | ||
2114 | if (hs == s->history + s->max) | ||
2115 | hs = s->history; | ||
2116 | if (hs == s->history + s->start) | ||
2117 | return NULL; | ||
2118 | } | ||
2119 | return hs; | ||
2120 | } | ||
2121 | |||
2122 | static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) | ||
2123 | { | ||
2124 | struct ext4_mb_proc_session *s = seq->private; | ||
2125 | struct ext4_mb_history *hs; | ||
2126 | int l = *pos; | ||
2127 | |||
2128 | if (l == 0) | ||
2129 | return SEQ_START_TOKEN; | ||
2130 | hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); | ||
2131 | if (!hs) | ||
2132 | return NULL; | ||
2133 | while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); | ||
2134 | return hs; | ||
2135 | } | ||
2136 | |||
2137 | static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, | ||
2138 | loff_t *pos) | ||
2139 | { | ||
2140 | struct ext4_mb_proc_session *s = seq->private; | ||
2141 | struct ext4_mb_history *hs = v; | ||
2142 | |||
2143 | ++*pos; | ||
2144 | if (v == SEQ_START_TOKEN) | ||
2145 | return ext4_mb_history_skip_empty(s, s->history + s->start, 1); | ||
2146 | else | ||
2147 | return ext4_mb_history_skip_empty(s, ++hs, 0); | ||
2148 | } | ||
2149 | |||
2150 | static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) | ||
2151 | { | ||
2152 | char buf[25], buf2[25], buf3[25], *fmt; | ||
2153 | struct ext4_mb_history *hs = v; | ||
2154 | |||
2155 | if (v == SEQ_START_TOKEN) { | ||
2156 | seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " | ||
2157 | "%-5s %-2s %-5s %-5s %-5s %-6s\n", | ||
2158 | "pid", "inode", "original", "goal", "result", "found", | ||
2159 | "grps", "cr", "flags", "merge", "tail", "broken"); | ||
2160 | return 0; | ||
2161 | } | ||
2162 | |||
2163 | if (hs->op == EXT4_MB_HISTORY_ALLOC) { | ||
2164 | fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " | ||
2165 | "%-5u %-5s %-5u %-6u\n"; | ||
2166 | sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, | ||
2167 | hs->result.fe_start, hs->result.fe_len, | ||
2168 | hs->result.fe_logical); | ||
2169 | sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, | ||
2170 | hs->orig.fe_start, hs->orig.fe_len, | ||
2171 | hs->orig.fe_logical); | ||
2172 | sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, | ||
2173 | hs->goal.fe_start, hs->goal.fe_len, | ||
2174 | hs->goal.fe_logical); | ||
2175 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, | ||
2176 | hs->found, hs->groups, hs->cr, hs->flags, | ||
2177 | hs->merged ? "M" : "", hs->tail, | ||
2178 | hs->buddy ? 1 << hs->buddy : 0); | ||
2179 | } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { | ||
2180 | fmt = "%-5u %-8u %-23s %-23s %-23s\n"; | ||
2181 | sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, | ||
2182 | hs->result.fe_start, hs->result.fe_len, | ||
2183 | hs->result.fe_logical); | ||
2184 | sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, | ||
2185 | hs->orig.fe_start, hs->orig.fe_len, | ||
2186 | hs->orig.fe_logical); | ||
2187 | seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); | ||
2188 | } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { | ||
2189 | sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, | ||
2190 | hs->result.fe_start, hs->result.fe_len); | ||
2191 | seq_printf(seq, "%-5u %-8u %-23s discard\n", | ||
2192 | hs->pid, hs->ino, buf2); | ||
2193 | } else if (hs->op == EXT4_MB_HISTORY_FREE) { | ||
2194 | sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, | ||
2195 | hs->result.fe_start, hs->result.fe_len); | ||
2196 | seq_printf(seq, "%-5u %-8u %-23s free\n", | ||
2197 | hs->pid, hs->ino, buf2); | ||
2198 | } | ||
2199 | return 0; | ||
2200 | } | ||
2201 | |||
2202 | static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) | ||
2203 | { | ||
2204 | } | ||
2205 | |||
2206 | static struct seq_operations ext4_mb_seq_history_ops = { | ||
2207 | .start = ext4_mb_seq_history_start, | ||
2208 | .next = ext4_mb_seq_history_next, | ||
2209 | .stop = ext4_mb_seq_history_stop, | ||
2210 | .show = ext4_mb_seq_history_show, | ||
2211 | }; | ||
2212 | |||
2213 | static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) | ||
2214 | { | ||
2215 | struct super_block *sb = PDE(inode)->data; | ||
2216 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2217 | struct ext4_mb_proc_session *s; | ||
2218 | int rc; | ||
2219 | int size; | ||
2220 | |||
2221 | s = kmalloc(sizeof(*s), GFP_KERNEL); | ||
2222 | if (s == NULL) | ||
2223 | return -ENOMEM; | ||
2224 | s->sb = sb; | ||
2225 | size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; | ||
2226 | s->history = kmalloc(size, GFP_KERNEL); | ||
2227 | if (s->history == NULL) { | ||
2228 | kfree(s); | ||
2229 | return -ENOMEM; | ||
2230 | } | ||
2231 | |||
2232 | spin_lock(&sbi->s_mb_history_lock); | ||
2233 | memcpy(s->history, sbi->s_mb_history, size); | ||
2234 | s->max = sbi->s_mb_history_max; | ||
2235 | s->start = sbi->s_mb_history_cur % s->max; | ||
2236 | spin_unlock(&sbi->s_mb_history_lock); | ||
2237 | |||
2238 | rc = seq_open(file, &ext4_mb_seq_history_ops); | ||
2239 | if (rc == 0) { | ||
2240 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
2241 | m->private = s; | ||
2242 | } else { | ||
2243 | kfree(s->history); | ||
2244 | kfree(s); | ||
2245 | } | ||
2246 | return rc; | ||
2248 | } | ||
2249 | |||
2250 | static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) | ||
2251 | { | ||
2252 | struct seq_file *seq = (struct seq_file *)file->private_data; | ||
2253 | struct ext4_mb_proc_session *s = seq->private; | ||
2254 | kfree(s->history); | ||
2255 | kfree(s); | ||
2256 | return seq_release(inode, file); | ||
2257 | } | ||
2258 | |||
2259 | static ssize_t ext4_mb_seq_history_write(struct file *file, | ||
2260 | const char __user *buffer, | ||
2261 | size_t count, loff_t *ppos) | ||
2262 | { | ||
2263 | struct seq_file *seq = (struct seq_file *)file->private_data; | ||
2264 | struct ext4_mb_proc_session *s = seq->private; | ||
2265 | struct super_block *sb = s->sb; | ||
2266 | char str[32]; | ||
2267 | int value; | ||
2268 | |||
2269 | if (count >= sizeof(str)) { | ||
2270 | printk(KERN_ERR "EXT4-fs: %s string too long, max %d bytes\n", | ||
2271 | "mb_history", (int)sizeof(str)); | ||
2272 | return -EOVERFLOW; | ||
2273 | } | ||
2274 | |||
2275 | if (copy_from_user(str, buffer, count)) | ||
2276 | return -EFAULT; | ||
2277 | |||
2278 | value = simple_strtol(str, NULL, 0); | ||
2279 | if (value < 0) | ||
2280 | return -ERANGE; | ||
2281 | EXT4_SB(sb)->s_mb_history_filter = value; | ||
2282 | |||
2283 | return count; | ||
2284 | } | ||
2285 | |||
2286 | static struct file_operations ext4_mb_seq_history_fops = { | ||
2287 | .owner = THIS_MODULE, | ||
2288 | .open = ext4_mb_seq_history_open, | ||
2289 | .read = seq_read, | ||
2290 | .write = ext4_mb_seq_history_write, | ||
2291 | .llseek = seq_lseek, | ||
2292 | .release = ext4_mb_seq_history_release, | ||
2293 | }; | ||
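| |||
| /* | ||
|  * These fops back the per-device mb_history file created in | ||
|  * ext4_mb_history_init() below; e.g. (illustrative device name) | ||
|  * reading /proc/fs/ext4/sda1/mb_history dumps recent allocations, | ||
|  * while writing a number to it sets s_mb_history_filter. | ||
|  */ | ||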
2294 | |||
2295 | static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) | ||
2296 | { | ||
2297 | struct super_block *sb = seq->private; | ||
2298 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2299 | ext4_group_t group; | ||
2300 | |||
2301 | if (*pos < 0 || *pos >= sbi->s_groups_count) | ||
2302 | return NULL; | ||
2303 | |||
2304 | group = *pos + 1; | ||
2305 | return (void *) group; | ||
2306 | } | ||
2307 | |||
2308 | static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) | ||
2309 | { | ||
2310 | struct super_block *sb = seq->private; | ||
2311 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2312 | ext4_group_t group; | ||
2313 | |||
2314 | ++*pos; | ||
2315 | if (*pos < 0 || *pos >= sbi->s_groups_count) | ||
2316 | return NULL; | ||
2317 | group = *pos + 1; | ||
2318 | return (void *) group; | ||
2319 | } | ||
2320 | |||
2321 | static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) | ||
2322 | { | ||
2323 | struct super_block *sb = seq->private; | ||
2324 | long group = (long) v; | ||
2325 | int i; | ||
2326 | int err; | ||
2327 | struct ext4_buddy e4b; | ||
2328 | struct sg { | ||
2329 | struct ext4_group_info info; | ||
2330 | unsigned short counters[16]; | ||
2331 | } sg; | ||
2332 | |||
2333 | group--; | ||
2334 | if (group == 0) | ||
2335 | seq_printf(seq, "#%-5s: %-5s %-5s %-5s " | ||
2336 | "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " | ||
2337 | "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", | ||
2338 | "group", "free", "frags", "first", | ||
2339 | "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", | ||
2340 | "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); | ||
2341 | |||
2342 | i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + | ||
2343 | sizeof(struct ext4_group_info); | ||
2344 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
2345 | if (err) { | ||
2346 | seq_printf(seq, "#%-5lu: I/O error\n", group); | ||
2347 | return 0; | ||
2348 | } | ||
2349 | ext4_lock_group(sb, group); | ||
2350 | memcpy(&sg, ext4_get_group_info(sb, group), i); | ||
2351 | ext4_unlock_group(sb, group); | ||
2352 | ext4_mb_release_desc(&e4b); | ||
2353 | |||
2354 | seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, | ||
2355 | sg.info.bb_fragments, sg.info.bb_first_free); | ||
2356 | for (i = 0; i <= 13; i++) | ||
2357 | seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? | ||
2358 | sg.info.bb_counters[i] : 0); | ||
2359 | seq_printf(seq, " ]\n"); | ||
2360 | |||
2361 | return 0; | ||
2362 | } | ||
2363 | |||
2364 | static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) | ||
2365 | { | ||
2366 | } | ||
2367 | |||
2368 | static struct seq_operations ext4_mb_seq_groups_ops = { | ||
2369 | .start = ext4_mb_seq_groups_start, | ||
2370 | .next = ext4_mb_seq_groups_next, | ||
2371 | .stop = ext4_mb_seq_groups_stop, | ||
2372 | .show = ext4_mb_seq_groups_show, | ||
2373 | }; | ||
2374 | |||
2375 | static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) | ||
2376 | { | ||
2377 | struct super_block *sb = PDE(inode)->data; | ||
2378 | int rc; | ||
2379 | |||
2380 | rc = seq_open(file, &ext4_mb_seq_groups_ops); | ||
2381 | if (rc == 0) { | ||
2382 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
2383 | m->private = sb; | ||
2384 | } | ||
2385 | return rc; | ||
2387 | } | ||
2388 | |||
2389 | static struct file_operations ext4_mb_seq_groups_fops = { | ||
2390 | .owner = THIS_MODULE, | ||
2391 | .open = ext4_mb_seq_groups_open, | ||
2392 | .read = seq_read, | ||
2393 | .llseek = seq_lseek, | ||
2394 | .release = seq_release, | ||
2395 | }; | ||
2396 | |||
2397 | static void ext4_mb_history_release(struct super_block *sb) | ||
2398 | { | ||
2399 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2400 | |||
2401 | remove_proc_entry("mb_groups", sbi->s_mb_proc); | ||
2402 | remove_proc_entry("mb_history", sbi->s_mb_proc); | ||
2403 | |||
2404 | kfree(sbi->s_mb_history); | ||
2405 | } | ||
2406 | |||
2407 | static void ext4_mb_history_init(struct super_block *sb) | ||
2408 | { | ||
2409 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2410 | int i; | ||
2411 | |||
2412 | if (sbi->s_mb_proc != NULL) { | ||
2413 | struct proc_dir_entry *p; | ||
2414 | p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); | ||
2415 | if (p) { | ||
2416 | p->proc_fops = &ext4_mb_seq_history_fops; | ||
2417 | p->data = sb; | ||
2418 | } | ||
2419 | p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); | ||
2420 | if (p) { | ||
2421 | p->proc_fops = &ext4_mb_seq_groups_fops; | ||
2422 | p->data = sb; | ||
2423 | } | ||
2424 | } | ||
2425 | |||
2426 | sbi->s_mb_history_max = 1000; | ||
2427 | sbi->s_mb_history_cur = 0; | ||
2428 | spin_lock_init(&sbi->s_mb_history_lock); | ||
2429 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); | ||
2430 | sbi->s_mb_history = kzalloc(i, GFP_KERNEL); | ||
2433 | /* if we can't allocate history, then we simply won't use it */ | ||
2434 | } | ||
2435 | |||
2436 | static void ext4_mb_store_history(struct ext4_allocation_context *ac) | ||
2437 | { | ||
2438 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
2439 | struct ext4_mb_history h; | ||
2440 | |||
2441 | if (unlikely(sbi->s_mb_history == NULL)) | ||
2442 | return; | ||
2443 | |||
2444 | if (!(ac->ac_op & sbi->s_mb_history_filter)) | ||
2445 | return; | ||
2446 | |||
2447 | h.op = ac->ac_op; | ||
2448 | h.pid = current->pid; | ||
2449 | h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; | ||
2450 | h.orig = ac->ac_o_ex; | ||
2451 | h.result = ac->ac_b_ex; | ||
2452 | h.flags = ac->ac_flags; | ||
2453 | h.found = ac->ac_found; | ||
2454 | h.groups = ac->ac_groups_scanned; | ||
2455 | h.cr = ac->ac_criteria; | ||
2456 | h.tail = ac->ac_tail; | ||
2457 | h.buddy = ac->ac_buddy; | ||
2458 | h.merged = 0; | ||
2459 | if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { | ||
2460 | if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && | ||
2461 | ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) | ||
2462 | h.merged = 1; | ||
2463 | h.goal = ac->ac_g_ex; | ||
2464 | h.result = ac->ac_f_ex; | ||
2465 | } | ||
2466 | |||
2467 | spin_lock(&sbi->s_mb_history_lock); | ||
2468 | memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); | ||
2469 | if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) | ||
2470 | sbi->s_mb_history_cur = 0; | ||
2471 | spin_unlock(&sbi->s_mb_history_lock); | ||
2472 | } | ||
2473 | |||
2474 | #else | ||
2475 | #define ext4_mb_history_release(sb) | ||
2476 | #define ext4_mb_history_init(sb) | ||
2477 | #endif | ||
2478 | |||
2479 | static int ext4_mb_init_backend(struct super_block *sb) | ||
2480 | { | ||
2481 | ext4_group_t i; | ||
2482 | int j, len, metalen; | ||
2483 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2484 | int num_meta_group_infos = | ||
2485 | (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> | ||
2486 | EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2487 | struct ext4_group_info **meta_group_info; | ||
2488 | |||
2489 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | ||
2490 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | ||
2491 | * So a two level scheme suffices for now. */ | ||
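| /* Example (hypothetical numbers): with 4KB blocks and 32-byte | ||
|  * descriptors EXT4_DESC_PER_BLOCK is 128, so a 2TB filesystem | ||
|  * (16384 groups of 128MB) needs 128 meta_group_info arrays. */ | ||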
2492 | sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * | ||
2493 | num_meta_group_infos, GFP_KERNEL); | ||
2494 | if (sbi->s_group_info == NULL) { | ||
2495 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | ||
2496 | return -ENOMEM; | ||
2497 | } | ||
2498 | sbi->s_buddy_cache = new_inode(sb); | ||
2499 | if (sbi->s_buddy_cache == NULL) { | ||
2500 | printk(KERN_ERR "EXT4-fs: can't get new inode\n"); | ||
2501 | goto err_freesgi; | ||
2502 | } | ||
2503 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; | ||
2504 | |||
2505 | metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2506 | for (i = 0; i < num_meta_group_infos; i++) { | ||
2507 | if ((i + 1) == num_meta_group_infos) | ||
2508 | metalen = sizeof(*meta_group_info) * | ||
2509 | (sbi->s_groups_count - | ||
2510 | (i << EXT4_DESC_PER_BLOCK_BITS(sb))); | ||
2511 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | ||
2512 | if (meta_group_info == NULL) { | ||
2513 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | ||
2514 | "buddy group\n"); | ||
2515 | goto err_freemeta; | ||
2516 | } | ||
2517 | sbi->s_group_info[i] = meta_group_info; | ||
2518 | } | ||
2519 | |||
2520 | /* | ||
2521 | * calculate the needed size; if the bb_counters size changes, | ||
2522 | * don't forget to adjust ext4_mb_generate_buddy() | ||
2523 | */ | ||
2524 | len = sizeof(struct ext4_group_info); | ||
2525 | len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); | ||
2526 | for (i = 0; i < sbi->s_groups_count; i++) { | ||
2527 | struct ext4_group_desc *desc; | ||
2528 | |||
2529 | meta_group_info = | ||
2530 | sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | ||
2531 | j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); | ||
2532 | |||
2533 | meta_group_info[j] = kzalloc(len, GFP_KERNEL); | ||
2534 | if (meta_group_info[j] == NULL) { | ||
2535 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | ||
2537 | goto err_freebuddy; | ||
2538 | } | ||
2539 | desc = ext4_get_group_desc(sb, i, NULL); | ||
2540 | if (desc == NULL) { | ||
2541 | printk(KERN_ERR | ||
2542 | "EXT4-fs: can't read descriptor %lu\n", i); | ||
| kfree(meta_group_info[j]); | ||
2543 | goto err_freebuddy; | ||
2544 | } | ||
2546 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | ||
2547 | &(meta_group_info[j]->bb_state)); | ||
2548 | |||
2549 | /* | ||
2550 | * initialize bb_free to be able to skip | ||
2551 | * empty groups without initialization | ||
2552 | */ | ||
2553 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
2554 | meta_group_info[j]->bb_free = | ||
2555 | ext4_free_blocks_after_init(sb, i, desc); | ||
2556 | } else { | ||
2557 | meta_group_info[j]->bb_free = | ||
2558 | le16_to_cpu(desc->bg_free_blocks_count); | ||
2559 | } | ||
2560 | |||
2561 | INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); | ||
2562 | |||
2563 | #ifdef DOUBLE_CHECK | ||
2564 | { | ||
2565 | struct buffer_head *bh; | ||
2566 | meta_group_info[j]->bb_bitmap = | ||
2567 | kmalloc(sb->s_blocksize, GFP_KERNEL); | ||
2568 | BUG_ON(meta_group_info[j]->bb_bitmap == NULL); | ||
2569 | bh = read_block_bitmap(sb, i); | ||
2570 | BUG_ON(bh == NULL); | ||
2571 | memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, | ||
2572 | sb->s_blocksize); | ||
2573 | put_bh(bh); | ||
2574 | } | ||
2575 | #endif | ||
2576 | |||
2577 | } | ||
2578 | |||
2579 | return 0; | ||
2580 | |||
2581 | err_freebuddy: | ||
| /* i is unsigned (ext4_group_t), so test before decrementing; | ||
|  * "while (i >= 0)" and "while (--i >= 0)" would never terminate */ | ||
2582 | while (i-- > 0) | ||
2583 | kfree(ext4_get_group_info(sb, i)); | ||
2586 | i = num_meta_group_infos; | ||
2587 | err_freemeta: | ||
2588 | while (i-- > 0) | ||
2589 | kfree(sbi->s_group_info[i]); | ||
2590 | iput(sbi->s_buddy_cache); | ||
2591 | err_freesgi: | ||
2592 | kfree(sbi->s_group_info); | ||
2593 | return -ENOMEM; | ||
2594 | } | ||
2595 | |||
2596 | int ext4_mb_init(struct super_block *sb, int needs_recovery) | ||
2597 | { | ||
2598 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2599 | unsigned i; | ||
2600 | unsigned offset; | ||
2601 | unsigned max; | ||
2602 | |||
2603 | if (!test_opt(sb, MBALLOC)) | ||
2604 | return 0; | ||
2605 | |||
2606 | i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); | ||
2607 | |||
2608 | sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); | ||
2609 | if (sbi->s_mb_offsets == NULL) { | ||
2610 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2611 | return -ENOMEM; | ||
2612 | } | ||
2613 | sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); | ||
2614 | if (sbi->s_mb_maxs == NULL) { | ||
2615 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2616 | kfree(sbi->s_mb_offsets); | ||
2617 | return -ENOMEM; | ||
2618 | } | ||
2619 | |||
2620 | /* order 0 is regular bitmap */ | ||
2621 | sbi->s_mb_maxs[0] = sb->s_blocksize << 3; | ||
2622 | sbi->s_mb_offsets[0] = 0; | ||
2623 | |||
2624 | i = 1; | ||
2625 | offset = 0; | ||
2626 | max = sb->s_blocksize << 2; | ||
2627 | do { | ||
2628 | sbi->s_mb_offsets[i] = offset; | ||
2629 | sbi->s_mb_maxs[i] = max; | ||
2630 | offset += 1 << (sb->s_blocksize_bits - i); | ||
2631 | max = max >> 1; | ||
2632 | i++; | ||
2633 | } while (i <= sb->s_blocksize_bits + 1); | ||
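| |||
| /* | ||
|  * Example of the resulting geometry (hypothetical numbers): with | ||
|  * 4KB blocks (s_blocksize_bits = 12) the order-1 bitmap holds | ||
|  * 16384 bits at byte offset 0, order-2 holds 8192 bits at offset | ||
|  * 2048, order-3 holds 4096 bits at offset 3072, halving each time, | ||
|  * so all orders together fit in a single buddy block; order 0 is | ||
|  * simply the regular on-disk block bitmap. | ||
|  */ | ||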
2634 | |||
2635 | /* init file for buddy data */ | ||
2636 | i = ext4_mb_init_backend(sb); | ||
2637 | if (i) { | ||
2638 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2639 | kfree(sbi->s_mb_offsets); | ||
2640 | kfree(sbi->s_mb_maxs); | ||
2641 | return i; | ||
2642 | } | ||
2643 | |||
2644 | spin_lock_init(&sbi->s_md_lock); | ||
2645 | INIT_LIST_HEAD(&sbi->s_active_transaction); | ||
2646 | INIT_LIST_HEAD(&sbi->s_closed_transaction); | ||
2647 | INIT_LIST_HEAD(&sbi->s_committed_transaction); | ||
2648 | spin_lock_init(&sbi->s_bal_lock); | ||
2649 | |||
2650 | sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; | ||
2651 | sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; | ||
2652 | sbi->s_mb_stats = MB_DEFAULT_STATS; | ||
2653 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; | ||
2654 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; | ||
2655 | sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; | ||
2656 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; | ||
2657 | |||
2658 | i = sizeof(struct ext4_locality_group) * NR_CPUS; | ||
2659 | sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); | ||
2660 | if (sbi->s_locality_groups == NULL) { | ||
2661 | clear_opt(sbi->s_mount_opt, MBALLOC); | ||
2662 | kfree(sbi->s_mb_offsets); | ||
2663 | kfree(sbi->s_mb_maxs); | ||
2664 | return -ENOMEM; | ||
2665 | } | ||
2666 | for (i = 0; i < NR_CPUS; i++) { | ||
2667 | struct ext4_locality_group *lg; | ||
2668 | lg = &sbi->s_locality_groups[i]; | ||
2669 | mutex_init(&lg->lg_mutex); | ||
2670 | INIT_LIST_HEAD(&lg->lg_prealloc_list); | ||
2671 | spin_lock_init(&lg->lg_prealloc_lock); | ||
2672 | } | ||
2673 | |||
2674 | ext4_mb_init_per_dev_proc(sb); | ||
2675 | ext4_mb_history_init(sb); | ||
2676 | |||
2677 | printk("EXT4-fs: mballoc enabled\n"); | ||
2678 | return 0; | ||
2679 | } | ||
2680 | |||
2681 | /* needs to be called with the ext4 group lock held (ext4_lock_group) */ | ||
2682 | static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) | ||
2683 | { | ||
2684 | struct ext4_prealloc_space *pa; | ||
2685 | struct list_head *cur, *tmp; | ||
2686 | int count = 0; | ||
2687 | |||
2688 | list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { | ||
2689 | pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); | ||
2690 | list_del(&pa->pa_group_list); | ||
2691 | count++; | ||
2692 | kfree(pa); | ||
2693 | } | ||
2694 | if (count) | ||
2695 | mb_debug("mballoc: %u leftover PAs freed\n", count); | ||
2697 | } | ||
2698 | |||
2699 | int ext4_mb_release(struct super_block *sb) | ||
2700 | { | ||
2701 | ext4_group_t i; | ||
2702 | int num_meta_group_infos; | ||
2703 | struct ext4_group_info *grinfo; | ||
2704 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2705 | |||
2706 | if (!test_opt(sb, MBALLOC)) | ||
2707 | return 0; | ||
2708 | |||
2709 | /* release freed, non-committed blocks */ | ||
2710 | spin_lock(&sbi->s_md_lock); | ||
2711 | list_splice_init(&sbi->s_closed_transaction, | ||
2712 | &sbi->s_committed_transaction); | ||
2713 | list_splice_init(&sbi->s_active_transaction, | ||
2714 | &sbi->s_committed_transaction); | ||
2715 | spin_unlock(&sbi->s_md_lock); | ||
2716 | ext4_mb_free_committed_blocks(sb); | ||
2717 | |||
2718 | if (sbi->s_group_info) { | ||
2719 | for (i = 0; i < sbi->s_groups_count; i++) { | ||
2720 | grinfo = ext4_get_group_info(sb, i); | ||
2721 | #ifdef DOUBLE_CHECK | ||
2722 | kfree(grinfo->bb_bitmap); | ||
2723 | #endif | ||
2724 | ext4_lock_group(sb, i); | ||
2725 | ext4_mb_cleanup_pa(grinfo); | ||
2726 | ext4_unlock_group(sb, i); | ||
2727 | kfree(grinfo); | ||
2728 | } | ||
2729 | num_meta_group_infos = (sbi->s_groups_count + | ||
2730 | EXT4_DESC_PER_BLOCK(sb) - 1) >> | ||
2731 | EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2732 | for (i = 0; i < num_meta_group_infos; i++) | ||
2733 | kfree(sbi->s_group_info[i]); | ||
2734 | kfree(sbi->s_group_info); | ||
2735 | } | ||
2736 | kfree(sbi->s_mb_offsets); | ||
2737 | kfree(sbi->s_mb_maxs); | ||
2738 | if (sbi->s_buddy_cache) | ||
2739 | iput(sbi->s_buddy_cache); | ||
2740 | if (sbi->s_mb_stats) { | ||
2741 | printk(KERN_INFO | ||
2742 | "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", | ||
2743 | atomic_read(&sbi->s_bal_allocated), | ||
2744 | atomic_read(&sbi->s_bal_reqs), | ||
2745 | atomic_read(&sbi->s_bal_success)); | ||
2746 | printk(KERN_INFO | ||
2747 | "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " | ||
2748 | "%u 2^N hits, %u breaks, %u lost\n", | ||
2749 | atomic_read(&sbi->s_bal_ex_scanned), | ||
2750 | atomic_read(&sbi->s_bal_goals), | ||
2751 | atomic_read(&sbi->s_bal_2orders), | ||
2752 | atomic_read(&sbi->s_bal_breaks), | ||
2753 | atomic_read(&sbi->s_mb_lost_chunks)); | ||
2754 | printk(KERN_INFO | ||
2755 | "EXT4-fs: mballoc: %lu generated and it took %Lu\n", | ||
2756 | sbi->s_mb_buddies_generated++, | ||
2757 | sbi->s_mb_generation_time); | ||
2758 | printk(KERN_INFO | ||
2759 | "EXT4-fs: mballoc: %u preallocated, %u discarded\n", | ||
2760 | atomic_read(&sbi->s_mb_preallocated), | ||
2761 | atomic_read(&sbi->s_mb_discarded)); | ||
2762 | } | ||
2763 | |||
2764 | kfree(sbi->s_locality_groups); | ||
2765 | |||
2766 | ext4_mb_history_release(sb); | ||
2767 | ext4_mb_destroy_per_dev_proc(sb); | ||
2768 | |||
2769 | return 0; | ||
2770 | } | ||
2771 | |||
2772 | static void ext4_mb_free_committed_blocks(struct super_block *sb) | ||
2773 | { | ||
2774 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2775 | int err; | ||
2776 | int i; | ||
2777 | int count = 0; | ||
2778 | int count2 = 0; | ||
2779 | struct ext4_free_metadata *md; | ||
2780 | struct ext4_buddy e4b; | ||
2781 | |||
2782 | if (list_empty(&sbi->s_committed_transaction)) | ||
2783 | return; | ||
2784 | |||
2785 | /* there are committed blocks yet to be freed */ | ||
2786 | do { | ||
2787 | /* get next array of blocks */ | ||
2788 | md = NULL; | ||
2789 | spin_lock(&sbi->s_md_lock); | ||
2790 | if (!list_empty(&sbi->s_committed_transaction)) { | ||
2791 | md = list_entry(sbi->s_committed_transaction.next, | ||
2792 | struct ext4_free_metadata, list); | ||
2793 | list_del(&md->list); | ||
2794 | } | ||
2795 | spin_unlock(&sbi->s_md_lock); | ||
2796 | |||
2797 | if (md == NULL) | ||
2798 | break; | ||
2799 | |||
2800 | mb_debug("gonna free %u blocks in group %lu (0x%p):", | ||
2801 | md->num, md->group, md); | ||
2802 | |||
2803 | err = ext4_mb_load_buddy(sb, md->group, &e4b); | ||
2804 | /* we expect to find existing buddy because it's pinned */ | ||
2805 | BUG_ON(err != 0); | ||
2806 | |||
2807 | /* there are blocks to put in buddy to make them really free */ | ||
2808 | count += md->num; | ||
2809 | count2++; | ||
2810 | ext4_lock_group(sb, md->group); | ||
2811 | for (i = 0; i < md->num; i++) { | ||
2812 | mb_debug(" %u", md->blocks[i]); | ||
2813 | err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | ||
2814 | BUG_ON(err != 0); | ||
2815 | } | ||
2816 | mb_debug("\n"); | ||
2817 | ext4_unlock_group(sb, md->group); | ||
2818 | |||
2819 | /* balance refcounts from ext4_mb_free_metadata() */ | ||
2820 | page_cache_release(e4b.bd_buddy_page); | ||
2821 | page_cache_release(e4b.bd_bitmap_page); | ||
2822 | |||
2823 | kfree(md); | ||
2824 | ext4_mb_release_desc(&e4b); | ||
2825 | |||
2826 | } while (md); | ||
2827 | |||
2828 | mb_debug("freed %u blocks in %u structures\n", count, count2); | ||
2829 | } | ||
2830 | |||
2831 | #define EXT4_ROOT "ext4" | ||
2832 | #define EXT4_MB_STATS_NAME "stats" | ||
2833 | #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" | ||
2834 | #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" | ||
2835 | #define EXT4_MB_ORDER2_REQ "order2_req" | ||
2836 | #define EXT4_MB_STREAM_REQ "stream_req" | ||
2837 | #define EXT4_MB_GROUP_PREALLOC "group_prealloc" | ||
2838 | |||
2839 | |||
2841 | #define MB_PROC_VALUE_READ(name) \ | ||
2842 | static int ext4_mb_read_##name(char *page, char **start, \ | ||
2843 | off_t off, int count, int *eof, void *data) \ | ||
2844 | { \ | ||
2845 | struct ext4_sb_info *sbi = data; \ | ||
2846 | int len; \ | ||
2847 | *eof = 1; \ | ||
2848 | if (off != 0) \ | ||
2849 | return 0; \ | ||
2850 | len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ | ||
2851 | *start = page; \ | ||
2852 | return len; \ | ||
2853 | } | ||
2854 | |||
2855 | #define MB_PROC_VALUE_WRITE(name) \ | ||
2856 | static int ext4_mb_write_##name(struct file *file, \ | ||
2857 | const char __user *buf, unsigned long cnt, void *data) \ | ||
2858 | { \ | ||
2859 | struct ext4_sb_info *sbi = data; \ | ||
2860 | char str[32]; \ | ||
2861 | long value; \ | ||
2862 | if (cnt >= sizeof(str)) \ | ||
2863 | return -EINVAL; \ | ||
2864 | if (copy_from_user(str, buf, cnt)) \ | ||
2865 | return -EFAULT; \ | ||
2866 | value = simple_strtol(str, NULL, 0); \ | ||
2867 | if (value <= 0) \ | ||
2868 | return -ERANGE; \ | ||
2869 | sbi->s_mb_##name = value; \ | ||
2870 | return cnt; \ | ||
2871 | } | ||
2872 | |||
2873 | MB_PROC_VALUE_READ(stats); | ||
2874 | MB_PROC_VALUE_WRITE(stats); | ||
2875 | MB_PROC_VALUE_READ(max_to_scan); | ||
2876 | MB_PROC_VALUE_WRITE(max_to_scan); | ||
2877 | MB_PROC_VALUE_READ(min_to_scan); | ||
2878 | MB_PROC_VALUE_WRITE(min_to_scan); | ||
2879 | MB_PROC_VALUE_READ(order2_reqs); | ||
2880 | MB_PROC_VALUE_WRITE(order2_reqs); | ||
2881 | MB_PROC_VALUE_READ(stream_request); | ||
2882 | MB_PROC_VALUE_WRITE(stream_request); | ||
2883 | MB_PROC_VALUE_READ(group_prealloc); | ||
2884 | MB_PROC_VALUE_WRITE(group_prealloc); | ||
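| |||
| /* | ||
|  * Each READ/WRITE pair above becomes a /proc handler, wired up by | ||
|  * MB_PROC_HANDLER() below; e.g. (illustrative device name) | ||
|  * "echo 64 > /proc/fs/ext4/sda1/max_to_scan" stores 64 into | ||
|  * sbi->s_mb_max_to_scan via ext4_mb_write_max_to_scan(). | ||
|  */ | ||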
2885 | |||
2886 | #define MB_PROC_HANDLER(name, var) \ | ||
2887 | do { \ | ||
2888 | proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ | ||
2889 | if (proc == NULL) { \ | ||
2890 | printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ | ||
2891 | goto err_out; \ | ||
2892 | } \ | ||
2893 | proc->data = sbi; \ | ||
2894 | proc->read_proc = ext4_mb_read_##var ; \ | ||
2895 | proc->write_proc = ext4_mb_write_##var; \ | ||
2896 | } while (0) | ||
2897 | |||
2898 | static int ext4_mb_init_per_dev_proc(struct super_block *sb) | ||
2899 | { | ||
2900 | mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; | ||
2901 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2902 | struct proc_dir_entry *proc; | ||
2903 | char devname[64]; | ||
2904 | |||
2905 | snprintf(devname, sizeof(devname) - 1, "%s", | ||
2906 | bdevname(sb->s_bdev, devname)); | ||
2907 | sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); | ||
| /* without the parent dir the handlers below would land in /proc root */ | ||
| if (sbi->s_mb_proc == NULL) | ||
| return -ENOMEM; | ||
2908 | |||
2909 | MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); | ||
2910 | MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); | ||
2911 | MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); | ||
2912 | MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); | ||
2913 | MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); | ||
2914 | MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); | ||
2915 | |||
2916 | return 0; | ||
2917 | |||
2918 | err_out: | ||
2919 | printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); | ||
2920 | remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); | ||
2921 | remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); | ||
2922 | remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); | ||
2923 | remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2924 | remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2925 | remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); | ||
2926 | remove_proc_entry(devname, proc_root_ext4); | ||
2927 | sbi->s_mb_proc = NULL; | ||
2928 | |||
2929 | return -ENOMEM; | ||
2930 | } | ||
2931 | |||
2932 | static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) | ||
2933 | { | ||
2934 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2935 | char devname[64]; | ||
2936 | |||
2937 | if (sbi->s_mb_proc == NULL) | ||
2938 | return -EINVAL; | ||
2939 | |||
2940 | snprintf(devname, sizeof(devname) - 1, "%s", | ||
2941 | bdevname(sb->s_bdev, devname)); | ||
2942 | remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); | ||
2943 | remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); | ||
2944 | remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); | ||
2945 | remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2946 | remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); | ||
2947 | remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); | ||
2948 | remove_proc_entry(devname, proc_root_ext4); | ||
2949 | |||
2950 | return 0; | ||
2951 | } | ||
2952 | |||
2953 | int __init init_ext4_mballoc(void) | ||
2954 | { | ||
2955 | ext4_pspace_cachep = | ||
2956 | kmem_cache_create("ext4_prealloc_space", | ||
2957 | sizeof(struct ext4_prealloc_space), | ||
2958 | 0, SLAB_RECLAIM_ACCOUNT, NULL); | ||
2959 | if (ext4_pspace_cachep == NULL) | ||
2960 | return -ENOMEM; | ||
2961 | |||
2962 | #ifdef CONFIG_PROC_FS | ||
2963 | proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs); | ||
2964 | if (proc_root_ext4 == NULL) | ||
2965 | printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT); | ||
2966 | #endif | ||
2967 | |||
2968 | return 0; | ||
2969 | } | ||
2970 | |||
2971 | void exit_ext4_mballoc(void) | ||
2972 | { | ||
2973 | /* XXX: synchronize_rcu(); */ | ||
2974 | kmem_cache_destroy(ext4_pspace_cachep); | ||
2975 | #ifdef CONFIG_PROC_FS | ||
2976 | remove_proc_entry(EXT4_ROOT, proc_root_fs); | ||
2977 | #endif | ||
2978 | } | ||
2979 | |||
2980 | |||
2981 | /* | ||
2982 | * Check quota and mark the chosen space (ac->ac_b_ex) non-free in bitmaps. | ||
2983 | * Returns 0 on success or an error code. | ||
2984 | */ | ||
2985 | static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | ||
2986 | handle_t *handle) | ||
2987 | { | ||
2988 | struct buffer_head *bitmap_bh = NULL; | ||
2989 | struct ext4_super_block *es; | ||
2990 | struct ext4_group_desc *gdp; | ||
2991 | struct buffer_head *gdp_bh; | ||
2992 | struct ext4_sb_info *sbi; | ||
2993 | struct super_block *sb; | ||
2994 | ext4_fsblk_t block; | ||
2995 | int err; | ||
2996 | |||
2997 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); | ||
2998 | BUG_ON(ac->ac_b_ex.fe_len <= 0); | ||
2999 | |||
3000 | sb = ac->ac_sb; | ||
3001 | sbi = EXT4_SB(sb); | ||
3002 | es = sbi->s_es; | ||
3003 | |||
3007 | err = -EIO; | ||
3008 | bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); | ||
3009 | if (!bitmap_bh) | ||
3010 | goto out_err; | ||
3011 | |||
3012 | err = ext4_journal_get_write_access(handle, bitmap_bh); | ||
3013 | if (err) | ||
3014 | goto out_err; | ||
3015 | |||
3016 | err = -EIO; | ||
3017 | gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); | ||
3018 | if (!gdp) | ||
3019 | goto out_err; | ||
| ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, | ||
| gdp->bg_free_blocks_count); | ||
3020 | |||
3021 | err = ext4_journal_get_write_access(handle, gdp_bh); | ||
3022 | if (err) | ||
3023 | goto out_err; | ||
3024 | |||
3025 | block = ac->ac_b_ex.fe_group * EXT4_BLOCKS_PER_GROUP(sb) | ||
3026 | + ac->ac_b_ex.fe_start | ||
3027 | + le32_to_cpu(es->s_first_data_block); | ||
3028 | |||
3029 | if (block == ext4_block_bitmap(sb, gdp) || | ||
3030 | block == ext4_inode_bitmap(sb, gdp) || | ||
3031 | in_range(block, ext4_inode_table(sb, gdp), | ||
3032 | EXT4_SB(sb)->s_itb_per_group)) { | ||
3033 | |||
3034 | ext4_error(sb, __FUNCTION__, | ||
3035 | "Allocating block in system zone - block = %llu", | ||
3036 | block); | ||
3037 | } | ||
3038 | #ifdef AGGRESSIVE_CHECK | ||
3039 | { | ||
3040 | int i; | ||
3041 | for (i = 0; i < ac->ac_b_ex.fe_len; i++) { | ||
3042 | BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, | ||
3043 | bitmap_bh->b_data)); | ||
3044 | } | ||
3045 | } | ||
3046 | #endif | ||
3047 | mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, | ||
3048 | ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); | ||
3049 | |||
3050 | spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | ||
3051 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
3052 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | ||
3053 | gdp->bg_free_blocks_count = | ||
3054 | cpu_to_le16(ext4_free_blocks_after_init(sb, | ||
3055 | ac->ac_b_ex.fe_group, | ||
3056 | gdp)); | ||
3057 | } | ||
3058 | gdp->bg_free_blocks_count = | ||
3059 | cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) | ||
3060 | - ac->ac_b_ex.fe_len); | ||
3061 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); | ||
3062 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | ||
3063 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); | ||
3064 | |||
3065 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
3066 | if (err) | ||
3067 | goto out_err; | ||
3068 | err = ext4_journal_dirty_metadata(handle, gdp_bh); | ||
3069 | |||
3070 | out_err: | ||
3071 | sb->s_dirt = 1; | ||
3072 | brelse(bitmap_bh); /* brelse() is NULL-safe: bitmap_bh may not have been read */ | ||
3073 | return err; | ||
3074 | } | ||
3075 | |||
3076 | /* | ||
3077 | * here we normalize the request for a locality group | ||
3078 | * Group requests are normalized to s_stripe size if it was set via the mount | ||
3079 | * option. If not, we set it to s_mb_group_prealloc, which can be tuned via | ||
3080 | * /proc/fs/ext4/<partition>/group_prealloc | ||
3081 | * | ||
3082 | * XXX: should we try to preallocate more than the group has now? | ||
3083 | */ | ||
3084 | static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) | ||
3085 | { | ||
3086 | struct super_block *sb = ac->ac_sb; | ||
3087 | struct ext4_locality_group *lg = ac->ac_lg; | ||
3088 | |||
3089 | BUG_ON(lg == NULL); | ||
3090 | if (EXT4_SB(sb)->s_stripe) | ||
3091 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; | ||
3092 | else | ||
3093 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; | ||
3094 | mb_debug("#%u: goal %lu blocks for locality group\n", | ||
3095 | current->pid, ac->ac_g_ex.fe_len); | ||
3096 | } | ||
3097 | |||
3098 | /* | ||
3099 | * Normalization means making the request better in terms of | ||
3100 | * size and alignment | ||
3101 | */ | ||
3102 | static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, | ||
3103 | struct ext4_allocation_request *ar) | ||
3104 | { | ||
3105 | int bsbits, max; | ||
3106 | ext4_lblk_t end; | ||
3107 | struct list_head *cur; | ||
3108 | loff_t size, orig_size, start_off; | ||
3109 | ext4_lblk_t start, orig_start; | ||
3110 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); | ||
3111 | |||
3112 | /* only normalize data requests, metadata requests | ||
3113 | do not need preallocation */ | ||
3114 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | ||
3115 | return; | ||
3116 | |||
3117 | /* sometimes the caller may want exactly the requested blocks */ | ||
3118 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
3119 | return; | ||
3120 | |||
3121 | /* caller may indicate that preallocation isn't | ||
3122 | * required (it's a tail, for example) */ | ||
3123 | if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) | ||
3124 | return; | ||
3125 | |||
3126 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { | ||
3127 | ext4_mb_normalize_group_request(ac); | ||
3128 | return; | ||
3129 | } | ||
3130 | |||
3131 | bsbits = ac->ac_sb->s_blocksize_bits; | ||
3132 | |||
3133 | /* first, determine the actual file size | ||
3134 | * assuming the current request is allocated */ | ||
3135 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
3136 | size = size << bsbits; | ||
3137 | if (size < i_size_read(ac->ac_inode)) | ||
3138 | size = i_size_read(ac->ac_inode); | ||
3139 | |||
3140 | /* max available blocks in a free group */ | ||
3141 | max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 - | ||
3142 | EXT4_SB(ac->ac_sb)->s_itb_per_group; | ||
3143 | |||
3144 | #define NRL_CHECK_SIZE(req, size, max, bits) \ | ||
3145 | (req <= (size) || max <= ((size) >> bits)) | ||
3146 | |||
3147 | /* first, try to predict filesize */ | ||
3148 | /* XXX: should this table be tunable? */ | ||
3149 | start_off = 0; | ||
3150 | if (size <= 16 * 1024) { | ||
3151 | size = 16 * 1024; | ||
3152 | } else if (size <= 32 * 1024) { | ||
3153 | size = 32 * 1024; | ||
3154 | } else if (size <= 64 * 1024) { | ||
3155 | size = 64 * 1024; | ||
3156 | } else if (size <= 128 * 1024) { | ||
3157 | size = 128 * 1024; | ||
3158 | } else if (size <= 256 * 1024) { | ||
3159 | size = 256 * 1024; | ||
3160 | } else if (size <= 512 * 1024) { | ||
3161 | size = 512 * 1024; | ||
3162 | } else if (size <= 1024 * 1024) { | ||
3163 | size = 1024 * 1024; | ||
3164 | } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) { | ||
3165 | start_off = ((loff_t)ac->ac_o_ex.fe_logical >> | ||
3166 | (20 - bsbits)) << 20; | ||
3167 | size = 1024 * 1024; | ||
3168 | } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) { | ||
3169 | start_off = ((loff_t)ac->ac_o_ex.fe_logical >> | ||
3170 | (22 - bsbits)) << 22; | ||
3171 | size = 4 * 1024 * 1024; | ||
3172 | } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, | ||
3173 | (8<<20)>>bsbits, max, bsbits)) { | ||
3174 | start_off = ((loff_t)ac->ac_o_ex.fe_logical >> | ||
3175 | (23 - bsbits)) << 23; | ||
3176 | size = 8 * 1024 * 1024; | ||
3177 | } else { | ||
3178 | start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; | ||
3179 | size = ac->ac_o_ex.fe_len << bsbits; | ||
3180 | } | ||
3181 | orig_size = size = size >> bsbits; | ||
3182 | orig_start = start = start_off >> bsbits; | ||
3183 | |||
3184 | /* don't cover already allocated blocks in selected range */ | ||
3185 | if (ar->pleft && start <= ar->lleft) { | ||
3186 | size -= ar->lleft + 1 - start; | ||
3187 | start = ar->lleft + 1; | ||
3188 | } | ||
3189 | if (ar->pright && start + size - 1 >= ar->lright) | ||
3190 | size -= start + size - ar->lright; | ||
3191 | |||
3192 | end = start + size; | ||
3193 | |||
3194 | /* check we don't cross already preallocated blocks */ | ||
3195 | rcu_read_lock(); | ||
3196 | list_for_each_rcu(cur, &ei->i_prealloc_list) { | ||
3197 | struct ext4_prealloc_space *pa; | ||
3198 | unsigned long pa_end; | ||
3199 | |||
3200 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3201 | |||
3202 | if (pa->pa_deleted) | ||
3203 | continue; | ||
3204 | spin_lock(&pa->pa_lock); | ||
3205 | if (pa->pa_deleted) { | ||
3206 | spin_unlock(&pa->pa_lock); | ||
3207 | continue; | ||
3208 | } | ||
3209 | |||
3210 | pa_end = pa->pa_lstart + pa->pa_len; | ||
3211 | |||
3212 | /* PA must not overlap original request */ | ||
3213 | BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || | ||
3214 | ac->ac_o_ex.fe_logical < pa->pa_lstart)); | ||
3215 | |||
3216 | /* skip PAs the normalized request doesn't overlap with */ | ||
3217 | if (pa->pa_lstart >= end) { | ||
3218 | spin_unlock(&pa->pa_lock); | ||
3219 | continue; | ||
3220 | } | ||
3221 | if (pa_end <= start) { | ||
3222 | spin_unlock(&pa->pa_lock); | ||
3223 | continue; | ||
3224 | } | ||
3225 | BUG_ON(pa->pa_lstart <= start && pa_end >= end); | ||
3226 | |||
3227 | if (pa_end <= ac->ac_o_ex.fe_logical) { | ||
3228 | BUG_ON(pa_end < start); | ||
3229 | start = pa_end; | ||
3230 | } | ||
3231 | |||
3232 | if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { | ||
3233 | BUG_ON(pa->pa_lstart > end); | ||
3234 | end = pa->pa_lstart; | ||
3235 | } | ||
3236 | spin_unlock(&pa->pa_lock); | ||
3237 | } | ||
3238 | rcu_read_unlock(); | ||
3239 | size = end - start; | ||
3240 | |||
3241 | /* XXX: extra loop to check we really don't overlap preallocations */ | ||
3242 | rcu_read_lock(); | ||
3243 | list_for_each_rcu(cur, &ei->i_prealloc_list) { | ||
3244 | struct ext4_prealloc_space *pa; | ||
3245 | unsigned long pa_end; | ||
3246 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3247 | spin_lock(&pa->pa_lock); | ||
3248 | if (pa->pa_deleted == 0) { | ||
3249 | pa_end = pa->pa_lstart + pa->pa_len; | ||
3250 | BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); | ||
3251 | } | ||
3252 | spin_unlock(&pa->pa_lock); | ||
3253 | } | ||
3254 | rcu_read_unlock(); | ||
3255 | |||
3256 | if (start + size <= ac->ac_o_ex.fe_logical && | ||
3257 | start > ac->ac_o_ex.fe_logical) { | ||
3258 | printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", | ||
3259 | (unsigned long) start, (unsigned long) size, | ||
3260 | (unsigned long) ac->ac_o_ex.fe_logical); | ||
3261 | } | ||
3262 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && | ||
3263 | start > ac->ac_o_ex.fe_logical); | ||
3264 | BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); | ||
3265 | |||
3266 | /* now prepare goal request */ | ||
3267 | |||
3268 | /* XXX: is it better to align blocks WRT to logical | ||
3269 | * placement or satisfy big request as is */ | ||
3270 | ac->ac_g_ex.fe_logical = start; | ||
3271 | ac->ac_g_ex.fe_len = size; | ||
3272 | |||
3273 | /* define goal start in order to merge */ | ||
3274 | if (ar->pright && (ar->lright == (start + size))) { | ||
3275 | /* merge to the right */ | ||
3276 | ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, | ||
3277 | &ac->ac_f_ex.fe_group, | ||
3278 | &ac->ac_f_ex.fe_start); | ||
3279 | ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; | ||
3280 | } | ||
3281 | if (ar->pleft && (ar->lleft + 1 == start)) { | ||
3282 | /* merge to the left */ | ||
3283 | ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, | ||
3284 | &ac->ac_f_ex.fe_group, | ||
3285 | &ac->ac_f_ex.fe_start); | ||
3286 | ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; | ||
3287 | } | ||
3288 | |||
3289 | mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, | ||
3290 | (unsigned) orig_size, (unsigned) start); | ||
3291 | } | ||
3292 | |||
3293 | static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) | ||
3294 | { | ||
3295 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
3296 | |||
3297 | if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { | ||
3298 | atomic_inc(&sbi->s_bal_reqs); | ||
3299 | atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); | ||
3300 | if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) | ||
3301 | atomic_inc(&sbi->s_bal_success); | ||
3302 | atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); | ||
3303 | if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && | ||
3304 | ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) | ||
3305 | atomic_inc(&sbi->s_bal_goals); | ||
3306 | if (ac->ac_found > sbi->s_mb_max_to_scan) | ||
3307 | atomic_inc(&sbi->s_bal_breaks); | ||
3308 | } | ||
3309 | |||
3310 | ext4_mb_store_history(ac); | ||
3311 | } | ||
3312 | |||
3313 | /* | ||
3314 | * use blocks preallocated to inode | ||
3315 | */ | ||
3316 | static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, | ||
3317 | struct ext4_prealloc_space *pa) | ||
3318 | { | ||
3319 | ext4_fsblk_t start; | ||
3320 | ext4_fsblk_t end; | ||
3321 | int len; | ||
3322 | |||
3323 | /* found preallocated blocks, use them */ | ||
3324 | start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); | ||
3325 | end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); | ||
3326 | len = end - start; | ||
3327 | ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, | ||
3328 | &ac->ac_b_ex.fe_start); | ||
3329 | ac->ac_b_ex.fe_len = len; | ||
3330 | ac->ac_status = AC_STATUS_FOUND; | ||
3331 | ac->ac_pa = pa; | ||
3332 | |||
3333 | BUG_ON(start < pa->pa_pstart); | ||
3334 | BUG_ON(start + len > pa->pa_pstart + pa->pa_len); | ||
3335 | BUG_ON(pa->pa_free < len); | ||
3336 | pa->pa_free -= len; | ||
3337 | |||
3338 | mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa); | ||
3339 | } | ||
3340 | |||
3341 | /* | ||
3342 | * use blocks preallocated to locality group | ||
3343 | */ | ||
3344 | static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, | ||
3345 | struct ext4_prealloc_space *pa) | ||
3346 | { | ||
3347 | unsigned len = ac->ac_o_ex.fe_len; | ||
3348 | |||
3349 | ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, | ||
3350 | &ac->ac_b_ex.fe_group, | ||
3351 | &ac->ac_b_ex.fe_start); | ||
3352 | ac->ac_b_ex.fe_len = len; | ||
3353 | ac->ac_status = AC_STATUS_FOUND; | ||
3354 | ac->ac_pa = pa; | ||
3355 | |||
3356 | /* we don't correct pa_pstart or pa_len here to avoid a | ||
3357 | * possible race when the group is being loaded concurrently; | ||
3358 | * instead we correct the pa later, after blocks are marked | ||
3359 | * in the on-disk bitmap -- see ext4_mb_release_context() */ | ||
3360 | /* | ||
3361 | * FIXME!! but the other CPUs can look at this particular | ||
3362 | * pa and think that it has enough free blocks if we | ||
3363 | * don't update pa_free here, right? | ||
3364 | */ | ||
3365 | mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart - len, len, pa); | ||
3366 | } | ||
3367 | |||
3368 | /* | ||
3369 | * search goal blocks in preallocated space | ||
3370 | */ | ||
3371 | static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | ||
3372 | { | ||
3373 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); | ||
3374 | struct ext4_locality_group *lg; | ||
3375 | struct ext4_prealloc_space *pa; | ||
3376 | struct list_head *cur; | ||
3377 | |||
3378 | /* only data can be preallocated */ | ||
3379 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | ||
3380 | return 0; | ||
3381 | |||
3382 | /* first, try per-file preallocation */ | ||
3383 | rcu_read_lock(); | ||
3384 | list_for_each_rcu(cur, &ei->i_prealloc_list) { | ||
3385 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3386 | |||
3387 | /* none of the fields in this condition change, | ||
3388 | * so we can skip locking for them */ | ||
3389 | if (ac->ac_o_ex.fe_logical < pa->pa_lstart || | ||
3390 | ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) | ||
3391 | continue; | ||
3392 | |||
3393 | /* found preallocated blocks, use them */ | ||
3394 | spin_lock(&pa->pa_lock); | ||
3395 | if (pa->pa_deleted == 0 && pa->pa_free) { | ||
3396 | atomic_inc(&pa->pa_count); | ||
3397 | ext4_mb_use_inode_pa(ac, pa); | ||
3398 | spin_unlock(&pa->pa_lock); | ||
3399 | ac->ac_criteria = 10; | ||
3400 | rcu_read_unlock(); | ||
3401 | return 1; | ||
3402 | } | ||
3403 | spin_unlock(&pa->pa_lock); | ||
3404 | } | ||
3405 | rcu_read_unlock(); | ||
3406 | |||
3407 | /* can we use group allocation? */ | ||
3408 | if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) | ||
3409 | return 0; | ||
3410 | |||
3411 | /* inode may have no locality group for some reason */ | ||
3412 | lg = ac->ac_lg; | ||
3413 | if (lg == NULL) | ||
3414 | return 0; | ||
3415 | |||
3416 | rcu_read_lock(); | ||
3417 | list_for_each_rcu(cur, &lg->lg_prealloc_list) { | ||
3418 | pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); | ||
3419 | spin_lock(&pa->pa_lock); | ||
3420 | if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { | ||
3421 | atomic_inc(&pa->pa_count); | ||
3422 | ext4_mb_use_group_pa(ac, pa); | ||
3423 | spin_unlock(&pa->pa_lock); | ||
3424 | ac->ac_criteria = 20; | ||
3425 | rcu_read_unlock(); | ||
3426 | return 1; | ||
3427 | } | ||
3428 | spin_unlock(&pa->pa_lock); | ||
3429 | } | ||
3430 | rcu_read_unlock(); | ||
3431 | |||
3432 | return 0; | ||
3433 | } | ||
3434 | |||
3435 | /* | ||
3436 | * the function goes through all preallocations in this group and marks them | ||
3437 | * used in the in-core bitmap. the buddy must be generated from this bitmap. | ||
3438 | * Needs to be called with the ext4 group lock held (ext4_lock_group) | ||
3439 | */ | ||
3440 | static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | ||
3441 | ext4_group_t group) | ||
3442 | { | ||
3443 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
3444 | struct ext4_prealloc_space *pa; | ||
3445 | struct list_head *cur; | ||
3446 | ext4_group_t groupnr; | ||
3447 | ext4_grpblk_t start; | ||
3448 | int preallocated = 0; | ||
3449 | int count = 0; | ||
3450 | int len; | ||
3451 | |||
3452 | /* all preallocation-discard paths load the group first, | ||
3453 | * so the only competing code is preallocation use. | ||
3454 | * we don't need any locking here. | ||
3455 | * note that we do NOT ignore preallocations with pa_deleted set; | ||
3456 | * otherwise we could leave used blocks available for | ||
3457 | * allocation in the buddy while a concurrent ext4_mb_put_pa() | ||
3458 | * is dropping the preallocation | ||
3459 | */ | ||
3460 | list_for_each(cur, &grp->bb_prealloc_list) { | ||
3461 | pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); | ||
3462 | spin_lock(&pa->pa_lock); | ||
3463 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, | ||
3464 | &groupnr, &start); | ||
3465 | len = pa->pa_len; | ||
3466 | spin_unlock(&pa->pa_lock); | ||
3467 | if (unlikely(len == 0)) | ||
3468 | continue; | ||
3469 | BUG_ON(groupnr != group); | ||
3470 | mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), | ||
3471 | bitmap, start, len); | ||
3472 | preallocated += len; | ||
3473 | count++; | ||
3474 | } | ||
3475 | mb_debug("preallocated %u for group %lu\n", preallocated, group); | ||
3476 | } | ||
3477 | |||
3478 | static void ext4_mb_pa_callback(struct rcu_head *head) | ||
3479 | { | ||
3480 | struct ext4_prealloc_space *pa; | ||
3481 | pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); | ||
3482 | kmem_cache_free(ext4_pspace_cachep, pa); | ||
3483 | } | ||
3484 | |||
3485 | /* | ||
3486 | * drops a reference to preallocated space descriptor | ||
3487 | * if this was the last reference and the space is consumed | ||
3488 | */ | ||
3489 | static void ext4_mb_put_pa(struct ext4_allocation_context *ac, | ||
3490 | struct super_block *sb, struct ext4_prealloc_space *pa) | ||
3491 | { | ||
3492 | unsigned long grp; | ||
3493 | |||
3494 | if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) | ||
3495 | return; | ||
3496 | |||
3497 | /* in this short window concurrent discard can set pa_deleted */ | ||
3498 | spin_lock(&pa->pa_lock); | ||
3499 | if (pa->pa_deleted == 1) { | ||
3500 | spin_unlock(&pa->pa_lock); | ||
3501 | return; | ||
3502 | } | ||
3503 | |||
3504 | pa->pa_deleted = 1; | ||
3505 | spin_unlock(&pa->pa_lock); | ||
3506 | |||
3507 | /* -1 is to protect from crossing allocation group */ | ||
3508 | ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); | ||
3509 | |||
3510 | /* | ||
3511 | * possible race: | ||
3512 | * | ||
3513 | * P1 (buddy init) P2 (regular allocation) | ||
3514 | * find block B in PA | ||
3515 | * copy on-disk bitmap to buddy | ||
3516 | * mark B in on-disk bitmap | ||
3517 | * drop PA from group | ||
3518 | * mark all PAs in buddy | ||
3519 | * | ||
3520 | * thus, P1 initializes buddy with B available. to prevent this | ||
3521 | * we make "copy" and "mark all PAs" atomic and serialize "drop PA" | ||
3522 | * against that pair | ||
3523 | */ | ||
3524 | ext4_lock_group(sb, grp); | ||
3525 | list_del(&pa->pa_group_list); | ||
3526 | ext4_unlock_group(sb, grp); | ||
3527 | |||
3528 | spin_lock(pa->pa_obj_lock); | ||
3529 | list_del_rcu(&pa->pa_inode_list); | ||
3530 | spin_unlock(pa->pa_obj_lock); | ||
3531 | |||
3532 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | ||
3533 | } | ||
3534 | |||
3535 | /* | ||
3536 | * creates new preallocated space for given inode | ||
3537 | */ | ||
3538 | static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) | ||
3539 | { | ||
3540 | struct super_block *sb = ac->ac_sb; | ||
3541 | struct ext4_prealloc_space *pa; | ||
3542 | struct ext4_group_info *grp; | ||
3543 | struct ext4_inode_info *ei; | ||
3544 | |||
3545 | /* preallocate only when found space is larger than requested */ | ||
3546 | BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); | ||
3547 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); | ||
3548 | BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); | ||
3549 | |||
3550 | pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); | ||
3551 | if (pa == NULL) | ||
3552 | return -ENOMEM; | ||
3553 | |||
3554 | if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { | ||
3555 | int winl; | ||
3556 | int wins; | ||
3557 | int win; | ||
3558 | int offs; | ||
3559 | |||
3560 | /* we can't allocate as much as the normalizer wants, | ||
3561 | * so the found space must get a proper lstart | ||
3562 | * to cover the original request */ | ||
3563 | BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); | ||
3564 | BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); | ||
3565 | |||
3566 | /* we're limited by the original request in that | ||
3567 | * the logical block must be covered anyway; | ||
3568 | * winl is the window we can move our chunk within */ | ||
3569 | winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; | ||
3570 | |||
3571 | /* also, we should cover the whole original request */ | ||
3572 | wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; | ||
3573 | |||
3574 | /* the smallest one defines the real window */ | ||
3575 | win = min(winl, wins); | ||
3576 | |||
3577 | offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; | ||
3578 | if (offs && offs < win) | ||
3579 | win = offs; | ||
3580 | |||
3581 | ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; | ||
3582 | BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); | ||
3583 | BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); | ||
3584 | } | ||
3585 | |||
3586 | /* preallocation can change ac_b_ex, thus we store actually | ||
3587 | * allocated blocks for history */ | ||
3588 | ac->ac_f_ex = ac->ac_b_ex; | ||
3589 | |||
3590 | pa->pa_lstart = ac->ac_b_ex.fe_logical; | ||
3591 | pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); | ||
3592 | pa->pa_len = ac->ac_b_ex.fe_len; | ||
3593 | pa->pa_free = pa->pa_len; | ||
3594 | atomic_set(&pa->pa_count, 1); | ||
3595 | spin_lock_init(&pa->pa_lock); | ||
3596 | pa->pa_deleted = 0; | ||
3597 | pa->pa_linear = 0; | ||
3598 | |||
3599 | mb_debug("new inode pa %p: %llu/%u for %u\n", pa, | ||
3600 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | ||
3601 | |||
3602 | ext4_mb_use_inode_pa(ac, pa); | ||
3603 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); | ||
3604 | |||
3605 | ei = EXT4_I(ac->ac_inode); | ||
3606 | grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); | ||
3607 | |||
3608 | pa->pa_obj_lock = &ei->i_prealloc_lock; | ||
3609 | pa->pa_inode = ac->ac_inode; | ||
3610 | |||
3611 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | ||
3612 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); | ||
3613 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | ||
3614 | |||
3615 | spin_lock(pa->pa_obj_lock); | ||
3616 | list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); | ||
3617 | spin_unlock(pa->pa_obj_lock); | ||
3618 | |||
3619 | return 0; | ||
3620 | } | ||
3621 | |||
3622 | /* | ||
3623 | * creates new preallocated space for the locality group the inode belongs to | ||
3624 | */ | ||
3625 | static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | ||
3626 | { | ||
3627 | struct super_block *sb = ac->ac_sb; | ||
3628 | struct ext4_locality_group *lg; | ||
3629 | struct ext4_prealloc_space *pa; | ||
3630 | struct ext4_group_info *grp; | ||
3631 | |||
3632 | /* preallocate only when found space is larger than requested */ | ||
3633 | BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); | ||
3634 | BUG_ON(ac->ac_status != AC_STATUS_FOUND); | ||
3635 | BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); | ||
3636 | |||
3637 | BUG_ON(ext4_pspace_cachep == NULL); | ||
3638 | pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); | ||
3639 | if (pa == NULL) | ||
3640 | return -ENOMEM; | ||
3641 | |||
3642 | /* preallocation can change ac_b_ex, thus we store actually | ||
3643 | * allocated blocks for history */ | ||
3644 | ac->ac_f_ex = ac->ac_b_ex; | ||
3645 | |||
3646 | pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); | ||
3647 | pa->pa_lstart = pa->pa_pstart; | ||
3648 | pa->pa_len = ac->ac_b_ex.fe_len; | ||
3649 | pa->pa_free = pa->pa_len; | ||
3650 | atomic_set(&pa->pa_count, 1); | ||
3651 | spin_lock_init(&pa->pa_lock); | ||
3652 | pa->pa_deleted = 0; | ||
3653 | pa->pa_linear = 1; | ||
3654 | |||
3655 | mb_debug("new group pa %p: %llu/%u for %u\n", pa, | ||
3656 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | ||
3657 | |||
3658 | ext4_mb_use_group_pa(ac, pa); | ||
3659 | atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); | ||
3660 | |||
3661 | grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); | ||
3662 | lg = ac->ac_lg; | ||
3663 | BUG_ON(lg == NULL); | ||
3664 | |||
3665 | pa->pa_obj_lock = &lg->lg_prealloc_lock; | ||
3666 | pa->pa_inode = NULL; | ||
3667 | |||
3668 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | ||
3669 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); | ||
3670 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | ||
3671 | |||
3672 | spin_lock(pa->pa_obj_lock); | ||
3673 | list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); | ||
3674 | spin_unlock(pa->pa_obj_lock); | ||
3675 | |||
3676 | return 0; | ||
3677 | } | ||
3678 | |||
3679 | static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) | ||
3680 | { | ||
3681 | int err; | ||
3682 | |||
3683 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) | ||
3684 | err = ext4_mb_new_group_pa(ac); | ||
3685 | else | ||
3686 | err = ext4_mb_new_inode_pa(ac); | ||
3687 | return err; | ||
3688 | } | ||
3689 | |||
3690 | /* | ||
3691 | * finds all unused blocks in the on-disk bitmap, frees them in | ||
3692 | * the in-core bitmap and buddy. | ||
3693 | * @pa must be unlinked from inode and group lists, so that | ||
3694 | * nobody else can find/use it. | ||
3695 | * the caller MUST hold group/inode locks. | ||
3696 | * TODO: optimize the case when there are no in-core structures yet | ||
3697 | */ | ||
3698 | static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, | ||
3699 | struct buffer_head *bitmap_bh, | ||
3700 | struct ext4_prealloc_space *pa) | ||
3701 | { | ||
3702 | struct ext4_allocation_context ac; | ||
3703 | struct super_block *sb = e4b->bd_sb; | ||
3704 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
3705 | unsigned long end; | ||
3706 | unsigned long next; | ||
3707 | ext4_group_t group; | ||
3708 | ext4_grpblk_t bit; | ||
3709 | sector_t start; | ||
3710 | int err = 0; | ||
3711 | int free = 0; | ||
3712 | |||
3713 | BUG_ON(pa->pa_deleted == 0); | ||
3714 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); | ||
3715 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); | ||
3716 | end = bit + pa->pa_len; | ||
3717 | |||
3718 | ac.ac_sb = sb; | ||
3719 | ac.ac_inode = pa->pa_inode; | ||
3720 | ac.ac_op = EXT4_MB_HISTORY_DISCARD; | ||
3721 | |||
3722 | while (bit < end) { | ||
3723 | bit = ext4_find_next_zero_bit(bitmap_bh->b_data, end, bit); | ||
3724 | if (bit >= end) | ||
3725 | break; | ||
3726 | next = ext4_find_next_bit(bitmap_bh->b_data, end, bit); | ||
3727 | if (next > end) | ||
3728 | next = end; | ||
3729 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + | ||
3730 | le32_to_cpu(sbi->s_es->s_first_data_block); | ||
3731 | mb_debug(" free preallocated %u/%u in group %u\n", | ||
3732 | (unsigned) start, (unsigned) next - bit, | ||
3733 | (unsigned) group); | ||
3734 | free += next - bit; | ||
3735 | |||
3736 | ac.ac_b_ex.fe_group = group; | ||
3737 | ac.ac_b_ex.fe_start = bit; | ||
3738 | ac.ac_b_ex.fe_len = next - bit; | ||
3739 | ac.ac_b_ex.fe_logical = 0; | ||
3740 | ext4_mb_store_history(&ac); | ||
3741 | |||
3742 | mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); | ||
3743 | bit = next + 1; | ||
3744 | } | ||
3745 | if (free != pa->pa_free) { | ||
3746 | printk(KERN_ERR "pa %p: logical %lu, phys. %lu, len %lu\n", | ||
3747 | pa, (unsigned long) pa->pa_lstart, | ||
3748 | (unsigned long) pa->pa_pstart, | ||
3749 | (unsigned long) pa->pa_len); | ||
3750 | printk(KERN_ERR "free %u, pa_free %u\n", free, pa->pa_free); | ||
3751 | } | ||
3752 | BUG_ON(free != pa->pa_free); | ||
3753 | atomic_add(free, &sbi->s_mb_discarded); | ||
3754 | |||
3755 | return err; | ||
3756 | } | ||
3757 | |||
3758 | static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, | ||
3759 | struct ext4_prealloc_space *pa) | ||
3760 | { | ||
3761 | struct ext4_allocation_context ac; | ||
3762 | struct super_block *sb = e4b->bd_sb; | ||
3763 | ext4_group_t group; | ||
3764 | ext4_grpblk_t bit; | ||
3765 | |||
3766 | ac.ac_op = EXT4_MB_HISTORY_DISCARD; | ||
3767 | |||
3768 | BUG_ON(pa->pa_deleted == 0); | ||
3769 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); | ||
3770 | BUG_ON(group != e4b->bd_group && pa->pa_len != 0); | ||
3771 | mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); | ||
3772 | atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); | ||
3773 | |||
3774 | ac.ac_sb = sb; | ||
3775 | ac.ac_inode = NULL; | ||
3776 | ac.ac_b_ex.fe_group = group; | ||
3777 | ac.ac_b_ex.fe_start = bit; | ||
3778 | ac.ac_b_ex.fe_len = pa->pa_len; | ||
3779 | ac.ac_b_ex.fe_logical = 0; | ||
3780 | ext4_mb_store_history(&ac); | ||
3781 | |||
3782 | return 0; | ||
3783 | } | ||
3784 | |||
3785 | /* | ||
3786 | * releases all preallocations in given group | ||
3787 | * | ||
3788 | * first, we need to decide discard policy: | ||
3789 | * - when do we discard | ||
3790 | * 1) ENOSPC | ||
3791 | * - how many do we discard | ||
3792 | * 1) how many requested | ||
3793 | */ | ||
3794 | static int ext4_mb_discard_group_preallocations(struct super_block *sb, | ||
3795 | ext4_group_t group, int needed) | ||
3796 | { | ||
3797 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | ||
3798 | struct buffer_head *bitmap_bh = NULL; | ||
3799 | struct ext4_prealloc_space *pa, *tmp; | ||
3800 | struct list_head list; | ||
3801 | struct ext4_buddy e4b; | ||
3802 | int err; | ||
3803 | int busy = 0; | ||
3804 | int free = 0; | ||
3805 | |||
3806 | mb_debug("discard preallocation for group %lu\n", group); | ||
3807 | |||
3808 | if (list_empty(&grp->bb_prealloc_list)) | ||
3809 | return 0; | ||
3810 | |||
3811 | bitmap_bh = read_block_bitmap(sb, group); | ||
3812 | if (bitmap_bh == NULL) { | ||
3813 | /* TODO: real error handling; note that e4b is not | ||
3814 | * loaded yet, so there is nothing to release here */ | ||
3815 | BUG_ON(bitmap_bh == NULL); | ||
3816 | } | ||
3817 | |||
3818 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
3819 | BUG_ON(err != 0); /* error handling here */ | ||
3820 | |||
3821 | if (needed == 0) | ||
3822 | needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; | ||
3823 | |||
3824 | grp = ext4_get_group_info(sb, group); | ||
3825 | INIT_LIST_HEAD(&list); | ||
3826 | |||
3827 | repeat: | ||
3828 | ext4_lock_group(sb, group); | ||
3829 | list_for_each_entry_safe(pa, tmp, | ||
3830 | &grp->bb_prealloc_list, pa_group_list) { | ||
3831 | spin_lock(&pa->pa_lock); | ||
3832 | if (atomic_read(&pa->pa_count)) { | ||
3833 | spin_unlock(&pa->pa_lock); | ||
3834 | busy = 1; | ||
3835 | continue; | ||
3836 | } | ||
3837 | if (pa->pa_deleted) { | ||
3838 | spin_unlock(&pa->pa_lock); | ||
3839 | continue; | ||
3840 | } | ||
3841 | |||
3842 | /* seems this one can be freed ... */ | ||
3843 | pa->pa_deleted = 1; | ||
3844 | |||
3845 | /* we can trust pa_free ... */ | ||
3846 | free += pa->pa_free; | ||
3847 | |||
3848 | spin_unlock(&pa->pa_lock); | ||
3849 | |||
3850 | list_del(&pa->pa_group_list); | ||
3851 | list_add(&pa->u.pa_tmp_list, &list); | ||
3852 | } | ||
3853 | |||
3854 | /* if we still need more blocks and some PAs were used, try again */ | ||
3855 | if (free < needed && busy) { | ||
3856 | busy = 0; | ||
3857 | ext4_unlock_group(sb, group); | ||
3858 | /* | ||
3859 | * Yield the CPU here so that we don't get soft lockup | ||
3860 | * in non preempt case. | ||
3861 | */ | ||
3862 | yield(); | ||
3863 | goto repeat; | ||
3864 | } | ||
3865 | |||
3866 | /* found anything to free? */ | ||
3867 | if (list_empty(&list)) { | ||
3868 | BUG_ON(free != 0); | ||
3869 | goto out; | ||
3870 | } | ||
3871 | |||
3872 | /* now free all selected PAs */ | ||
3873 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { | ||
3874 | |||
3875 | /* remove from object (inode or locality group) */ | ||
3876 | spin_lock(pa->pa_obj_lock); | ||
3877 | list_del_rcu(&pa->pa_inode_list); | ||
3878 | spin_unlock(pa->pa_obj_lock); | ||
3879 | |||
3880 | if (pa->pa_linear) | ||
3881 | ext4_mb_release_group_pa(&e4b, pa); | ||
3882 | else | ||
3883 | ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); | ||
3884 | |||
3885 | list_del(&pa->u.pa_tmp_list); | ||
3886 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | ||
3887 | } | ||
3888 | |||
3889 | out: | ||
3890 | ext4_unlock_group(sb, group); | ||
3891 | ext4_mb_release_desc(&e4b); | ||
3892 | put_bh(bitmap_bh); | ||
3893 | return free; | ||
3894 | } | ||
3895 | |||
3896 | /* | ||
3897 | * releases all unused preallocated blocks for the given inode | ||
3898 | * | ||
3899 | * It's important to discard preallocations under i_data_sem. | ||
3900 | * We don't want another block to be served from the prealloc | ||
3901 | * space while we are discarding the inode prealloc space. | ||
3902 | * | ||
3903 | * FIXME!! Make sure it is valid at all the call sites | ||
3904 | */ | ||
3905 | void ext4_mb_discard_inode_preallocations(struct inode *inode) | ||
3906 | { | ||
3907 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3908 | struct super_block *sb = inode->i_sb; | ||
3909 | struct buffer_head *bitmap_bh = NULL; | ||
3910 | struct ext4_prealloc_space *pa, *tmp; | ||
3911 | ext4_group_t group = 0; | ||
3912 | struct list_head list; | ||
3913 | struct ext4_buddy e4b; | ||
3914 | int err; | ||
3915 | |||
3916 | if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { | ||
3917 | /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ | ||
3918 | return; | ||
3919 | } | ||
3920 | |||
3921 | mb_debug("discard preallocation for inode %lu\n", inode->i_ino); | ||
3922 | |||
3923 | INIT_LIST_HEAD(&list); | ||
3924 | |||
3925 | repeat: | ||
3926 | /* first, collect all pa's in the inode */ | ||
3927 | spin_lock(&ei->i_prealloc_lock); | ||
3928 | while (!list_empty(&ei->i_prealloc_list)) { | ||
3929 | pa = list_entry(ei->i_prealloc_list.next, | ||
3930 | struct ext4_prealloc_space, pa_inode_list); | ||
3931 | BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); | ||
3932 | spin_lock(&pa->pa_lock); | ||
3933 | if (atomic_read(&pa->pa_count)) { | ||
3934 | /* this shouldn't happen often - nobody should | ||
3935 | * use preallocation while we're discarding it */ | ||
3936 | spin_unlock(&pa->pa_lock); | ||
3937 | spin_unlock(&ei->i_prealloc_lock); | ||
3938 | printk(KERN_ERR "uh-oh! used pa while discarding\n"); | ||
3939 | WARN_ON(1); | ||
3940 | schedule_timeout_uninterruptible(HZ); | ||
3941 | goto repeat; | ||
3942 | |||
3943 | } | ||
3944 | if (pa->pa_deleted == 0) { | ||
3945 | pa->pa_deleted = 1; | ||
3946 | spin_unlock(&pa->pa_lock); | ||
3947 | list_del_rcu(&pa->pa_inode_list); | ||
3948 | list_add(&pa->u.pa_tmp_list, &list); | ||
3949 | continue; | ||
3950 | } | ||
3951 | |||
3952 | /* someone is deleting pa right now */ | ||
3953 | spin_unlock(&pa->pa_lock); | ||
3954 | spin_unlock(&ei->i_prealloc_lock); | ||
3955 | |||
3956 | /* we have to wait here because pa_deleted | ||
3957 | * doesn't mean the pa is already unlinked from | ||
3958 | * the list. as we might be called from | ||
3959 | * ->clear_inode(), the inode will get freed, | ||
3960 | * and a concurrent thread which is unlinking the | ||
3961 | * pa from the inode's list may access already | ||
3962 | * freed memory -- bad, bad, bad */ | ||
3963 | |||
3964 | /* XXX: if this happens too often, we can | ||
3965 | * add a flag to force wait only in case | ||
3966 | * of ->clear_inode(), but not in case of | ||
3967 | * regular truncate */ | ||
3968 | schedule_timeout_uninterruptible(HZ); | ||
3969 | goto repeat; | ||
3970 | } | ||
3971 | spin_unlock(&ei->i_prealloc_lock); | ||
3972 | |||
3973 | list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { | ||
3974 | BUG_ON(pa->pa_linear != 0); | ||
3975 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | ||
3976 | |||
3977 | err = ext4_mb_load_buddy(sb, group, &e4b); | ||
3978 | BUG_ON(err != 0); /* error handling here */ | ||
3979 | |||
3980 | bitmap_bh = read_block_bitmap(sb, group); | ||
3981 | if (bitmap_bh == NULL) { | ||
3982 | /* error handling here */ | ||
3983 | ext4_mb_release_desc(&e4b); | ||
3984 | BUG_ON(bitmap_bh == NULL); | ||
3985 | } | ||
3986 | |||
3987 | ext4_lock_group(sb, group); | ||
3988 | list_del(&pa->pa_group_list); | ||
3989 | ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); | ||
3990 | ext4_unlock_group(sb, group); | ||
3991 | |||
3992 | ext4_mb_release_desc(&e4b); | ||
3993 | put_bh(bitmap_bh); | ||
3994 | |||
3995 | list_del(&pa->u.pa_tmp_list); | ||
3996 | call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | ||
3997 | } | ||
3998 | } | ||
3999 | |||
4000 | /* | ||
4001 | * finds all preallocated spaces and returns blocks being freed to them; | ||
4002 | * if a preallocated space becomes full (no block is used from the space) | ||
4003 | * then the function frees the space in the buddy | ||
4004 | * XXX: at the moment, truncate (which is the only way to free blocks) | ||
4005 | * discards all preallocations | ||
4006 | */ | ||
4007 | static void ext4_mb_return_to_preallocation(struct inode *inode, | ||
4008 | struct ext4_buddy *e4b, | ||
4009 | sector_t block, int count) | ||
4010 | { | ||
4011 | BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); | ||
4012 | } | ||
4013 | #ifdef MB_DEBUG | ||
4014 | static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | ||
4015 | { | ||
4016 | struct super_block *sb = ac->ac_sb; | ||
4017 | ext4_group_t i; | ||
4018 | |||
4019 | printk(KERN_ERR "EXT4-fs: Can't allocate:" | ||
4020 | " Allocation context details:\n"); | ||
4021 | printk(KERN_ERR "EXT4-fs: status %d flags %d\n", | ||
4022 | ac->ac_status, ac->ac_flags); | ||
4023 | printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " | ||
4024 | "best %lu/%lu/%lu@%lu cr %d\n", | ||
4025 | (unsigned long)ac->ac_o_ex.fe_group, | ||
4026 | (unsigned long)ac->ac_o_ex.fe_start, | ||
4027 | (unsigned long)ac->ac_o_ex.fe_len, | ||
4028 | (unsigned long)ac->ac_o_ex.fe_logical, | ||
4029 | (unsigned long)ac->ac_g_ex.fe_group, | ||
4030 | (unsigned long)ac->ac_g_ex.fe_start, | ||
4031 | (unsigned long)ac->ac_g_ex.fe_len, | ||
4032 | (unsigned long)ac->ac_g_ex.fe_logical, | ||
4033 | (unsigned long)ac->ac_b_ex.fe_group, | ||
4034 | (unsigned long)ac->ac_b_ex.fe_start, | ||
4035 | (unsigned long)ac->ac_b_ex.fe_len, | ||
4036 | (unsigned long)ac->ac_b_ex.fe_logical, | ||
4037 | (int)ac->ac_criteria); | ||
4038 | printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, | ||
4039 | ac->ac_found); | ||
4040 | printk(KERN_ERR "EXT4-fs: groups:\n"); | ||
4041 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { | ||
4042 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); | ||
4043 | struct ext4_prealloc_space *pa; | ||
4044 | ext4_grpblk_t start; | ||
4045 | struct list_head *cur; | ||
4046 | ext4_lock_group(sb, i); | ||
4047 | list_for_each(cur, &grp->bb_prealloc_list) { | ||
4048 | pa = list_entry(cur, struct ext4_prealloc_space, | ||
4049 | pa_group_list); | ||
4050 | spin_lock(&pa->pa_lock); | ||
4051 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, | ||
4052 | NULL, &start); | ||
4053 | spin_unlock(&pa->pa_lock); | ||
4054 | printk(KERN_ERR "PA:%lu:%d:%u\n", i, | ||
4055 | start, pa->pa_len); | ||
4056 | } | ||
4057 | ext4_unlock_group(sb, i); | ||
4058 | |||
4059 | if (grp->bb_free == 0) | ||
4060 | continue; | ||
4061 | printk(KERN_ERR "%lu: %d/%d\n", | ||
4062 | i, grp->bb_free, grp->bb_fragments); | ||
4063 | } | ||
4064 | printk(KERN_ERR "\n"); | ||
4065 | } | ||
4066 | #else | ||
4067 | static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) | ||
4068 | { | ||
4069 | return; | ||
4070 | } | ||
4071 | #endif | ||
4072 | |||
4073 | /* | ||
4074 | * We use locality group preallocation for small files. The size of the | ||
4075 | * file is determined by the current size or the resulting size after | ||
4076 | * allocation, whichever is larger | ||
4077 | * | ||
4078 | * One can tune this size via /proc/fs/ext4/<partition>/stream_req | ||
4079 | */ | ||
4080 | static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) | ||
4081 | { | ||
4082 | struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); | ||
4083 | int bsbits = ac->ac_sb->s_blocksize_bits; | ||
4084 | loff_t size, isize; | ||
4085 | |||
4086 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | ||
4087 | return; | ||
4088 | |||
4089 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
4090 | isize = i_size_read(ac->ac_inode) >> bsbits; | ||
4091 | size = max(size, isize); | ||
4092 | |||
4093 | /* don't use group allocation for large files */ | ||
4094 | if (size >= sbi->s_mb_stream_request) | ||
4095 | return; | ||
4096 | |||
4097 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
4098 | return; | ||
4099 | |||
4100 | BUG_ON(ac->ac_lg != NULL); | ||
4101 | /* | ||
4102 | * locality group prealloc space is per-CPU. The reason for having | ||
4103 | * a per-CPU locality group is to reduce contention between block | ||
4104 | * requests from multiple CPUs. | ||
4105 | */ | ||
4106 | ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; | ||
4107 | put_cpu(); | ||
4108 | |||
4109 | /* we're going to use group allocation */ | ||
4110 | ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; | ||
4111 | |||
4112 | /* serialize all allocations in the group */ | ||
4113 | mutex_lock(&ac->ac_lg->lg_mutex); | ||
4114 | } | ||
4115 | |||
4116 | static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, | ||
4117 | struct ext4_allocation_request *ar) | ||
4118 | { | ||
4119 | struct super_block *sb = ar->inode->i_sb; | ||
4120 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
4121 | struct ext4_super_block *es = sbi->s_es; | ||
4122 | ext4_group_t group; | ||
4123 | unsigned long len; | ||
4124 | unsigned long goal; | ||
4125 | ext4_grpblk_t block; | ||
4126 | |||
4127 | /* we can't allocate > group size */ | ||
4128 | len = ar->len; | ||
4129 | |||
4130 | /* just a dirty hack to filter too big requests */ | ||
4131 | if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) | ||
4132 | len = EXT4_BLOCKS_PER_GROUP(sb) - 10; | ||
4133 | |||
4134 | /* start searching from the goal */ | ||
4135 | goal = ar->goal; | ||
4136 | if (goal < le32_to_cpu(es->s_first_data_block) || | ||
4137 | goal >= ext4_blocks_count(es)) | ||
4138 | goal = le32_to_cpu(es->s_first_data_block); | ||
4139 | ext4_get_group_no_and_offset(sb, goal, &group, &block); | ||
4140 | |||
4141 | /* set up allocation goals */ | ||
4142 | ac->ac_b_ex.fe_logical = ar->logical; | ||
4143 | ac->ac_b_ex.fe_group = 0; | ||
4144 | ac->ac_b_ex.fe_start = 0; | ||
4145 | ac->ac_b_ex.fe_len = 0; | ||
4146 | ac->ac_status = AC_STATUS_CONTINUE; | ||
4147 | ac->ac_groups_scanned = 0; | ||
4148 | ac->ac_ex_scanned = 0; | ||
4149 | ac->ac_found = 0; | ||
4150 | ac->ac_sb = sb; | ||
4151 | ac->ac_inode = ar->inode; | ||
4152 | ac->ac_o_ex.fe_logical = ar->logical; | ||
4153 | ac->ac_o_ex.fe_group = group; | ||
4154 | ac->ac_o_ex.fe_start = block; | ||
4155 | ac->ac_o_ex.fe_len = len; | ||
4156 | ac->ac_g_ex.fe_logical = ar->logical; | ||
4157 | ac->ac_g_ex.fe_group = group; | ||
4158 | ac->ac_g_ex.fe_start = block; | ||
4159 | ac->ac_g_ex.fe_len = len; | ||
4160 | ac->ac_f_ex.fe_len = 0; | ||
4161 | ac->ac_flags = ar->flags; | ||
4162 | ac->ac_2order = 0; | ||
4163 | ac->ac_criteria = 0; | ||
4164 | ac->ac_pa = NULL; | ||
4165 | ac->ac_bitmap_page = NULL; | ||
4166 | ac->ac_buddy_page = NULL; | ||
4167 | ac->ac_lg = NULL; | ||
4168 | |||
4169 | /* we have to define the context: will we work with a file or | ||
4170 | * a locality group? this is a policy, actually */ | ||
4171 | ext4_mb_group_or_file(ac); | ||
4172 | |||
4173 | mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " | ||
4174 | "left: %u/%u, right %u/%u to %swritable\n", | ||
4175 | (unsigned) ar->len, (unsigned) ar->logical, | ||
4176 | (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, | ||
4177 | (unsigned) ar->lleft, (unsigned) ar->pleft, | ||
4178 | (unsigned) ar->lright, (unsigned) ar->pright, | ||
4179 | atomic_read(&ar->inode->i_writecount) ? "" : "non-"); | ||
4180 | return 0; | ||
4181 | |||
4182 | } | ||
4183 | |||
4184 | /* | ||
4185 | * release all resources used in the allocation | ||
4186 | */ | ||
4187 | static int ext4_mb_release_context(struct ext4_allocation_context *ac) | ||
4188 | { | ||
4189 | if (ac->ac_pa) { | ||
4190 | if (ac->ac_pa->pa_linear) { | ||
4191 | /* see comment in ext4_mb_use_group_pa() */ | ||
4192 | spin_lock(&ac->ac_pa->pa_lock); | ||
4193 | ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; | ||
4194 | ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; | ||
4195 | ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; | ||
4196 | ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; | ||
4197 | spin_unlock(&ac->ac_pa->pa_lock); | ||
4198 | } | ||
4199 | ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); | ||
4200 | } | ||
4201 | if (ac->ac_bitmap_page) | ||
4202 | page_cache_release(ac->ac_bitmap_page); | ||
4203 | if (ac->ac_buddy_page) | ||
4204 | page_cache_release(ac->ac_buddy_page); | ||
4205 | if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) | ||
4206 | mutex_unlock(&ac->ac_lg->lg_mutex); | ||
4207 | ext4_mb_collect_stats(ac); | ||
4208 | return 0; | ||
4209 | } | ||
4210 | |||
4211 | static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) | ||
4212 | { | ||
4213 | ext4_group_t i; | ||
4214 | int ret; | ||
4215 | int freed = 0; | ||
4216 | |||
4217 | for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { | ||
4218 | ret = ext4_mb_discard_group_preallocations(sb, i, needed); | ||
4219 | freed += ret; | ||
4220 | needed -= ret; | ||
4221 | } | ||
4222 | |||
4223 | return freed; | ||
4224 | } | ||
4225 | |||
4226 | /* | ||
4227 | * Main entry point into mballoc to allocate blocks | ||
4228 | * it tries to use preallocation first, then falls back | ||
4229 | * to usual allocation | ||
4230 | */ | ||
4231 | ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | ||
4232 | struct ext4_allocation_request *ar, int *errp) | ||
4233 | { | ||
4234 | struct ext4_allocation_context ac; | ||
4235 | struct ext4_sb_info *sbi; | ||
4236 | struct super_block *sb; | ||
4237 | ext4_fsblk_t block = 0; | ||
4238 | int freed; | ||
4239 | int inquota; | ||
4240 | |||
4241 | sb = ar->inode->i_sb; | ||
4242 | sbi = EXT4_SB(sb); | ||
4243 | |||
4244 | if (!test_opt(sb, MBALLOC)) { | ||
4245 | block = ext4_new_blocks_old(handle, ar->inode, ar->goal, | ||
4246 | &(ar->len), errp); | ||
4247 | return block; | ||
4248 | } | ||
4249 | |||
4250 | while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { | ||
4251 | ar->flags |= EXT4_MB_HINT_NOPREALLOC; | ||
4252 | ar->len--; | ||
4253 | } | ||
4254 | if (ar->len == 0) { | ||
4255 | *errp = -EDQUOT; | ||
4256 | return 0; | ||
4257 | } | ||
4258 | inquota = ar->len; | ||
4259 | |||
4260 | ext4_mb_poll_new_transaction(sb, handle); | ||
4261 | |||
4262 | *errp = ext4_mb_initialize_context(&ac, ar); | ||
4263 | if (*errp) { | ||
4264 | ar->len = 0; | ||
4265 | goto out; | ||
4266 | } | ||
4267 | |||
4268 | ac.ac_op = EXT4_MB_HISTORY_PREALLOC; | ||
4269 | if (!ext4_mb_use_preallocated(&ac)) { | ||
4270 | |||
4271 | ac.ac_op = EXT4_MB_HISTORY_ALLOC; | ||
4272 | ext4_mb_normalize_request(&ac, ar); | ||
4273 | |||
4274 | repeat: | ||
4275 | /* allocate space in core */ | ||
4276 | ext4_mb_regular_allocator(&ac); | ||
4277 | |||
4278 | /* as we've just preallocated more space than the | ||
4279 | * user originally requested, we store the allocated | ||
4280 | * space in a special descriptor */ | ||
4281 | if (ac.ac_status == AC_STATUS_FOUND && | ||
4282 | ac.ac_o_ex.fe_len < ac.ac_b_ex.fe_len) | ||
4283 | ext4_mb_new_preallocation(&ac); | ||
4284 | } | ||
4285 | |||
4286 | if (likely(ac.ac_status == AC_STATUS_FOUND)) { | ||
4287 | ext4_mb_mark_diskspace_used(&ac, handle); | ||
4288 | *errp = 0; | ||
4289 | block = ext4_grp_offs_to_block(sb, &ac.ac_b_ex); | ||
4290 | ar->len = ac.ac_b_ex.fe_len; | ||
4291 | } else { | ||
4292 | freed = ext4_mb_discard_preallocations(sb, ac.ac_o_ex.fe_len); | ||
4293 | if (freed) | ||
4294 | goto repeat; | ||
4295 | *errp = -ENOSPC; | ||
4296 | ac.ac_b_ex.fe_len = 0; | ||
4297 | ar->len = 0; | ||
4298 | ext4_mb_show_ac(&ac); | ||
4299 | } | ||
4300 | |||
4301 | ext4_mb_release_context(&ac); | ||
4302 | |||
4303 | out: | ||
4304 | if (ar->len < inquota) | ||
4305 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); | ||
4306 | |||
4307 | return block; | ||
4308 | } | ||
4309 | static void ext4_mb_poll_new_transaction(struct super_block *sb, | ||
4310 | handle_t *handle) | ||
4311 | { | ||
4312 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
4313 | |||
4314 | if (sbi->s_last_transaction == handle->h_transaction->t_tid) | ||
4315 | return; | ||
4316 | |||
4317 | /* new transaction! time to close the last one and free blocks for | ||
4318 | * the committed transaction. we know that only one transaction can | ||
4319 | * be active, so the previous transaction may still be being logged, | ||
4320 | * and the transaction before the previous one is known to be already | ||
4321 | * logged. this means that now we may free blocks freed in all | ||
4322 | * transactions before the previous one. hope I'm clear enough ... */ | ||
4323 | |||
4324 | spin_lock(&sbi->s_md_lock); | ||
4325 | if (sbi->s_last_transaction != handle->h_transaction->t_tid) { | ||
4326 | mb_debug("new transaction %lu, old %lu\n", | ||
4327 | (unsigned long) handle->h_transaction->t_tid, | ||
4328 | (unsigned long) sbi->s_last_transaction); | ||
4329 | list_splice_init(&sbi->s_closed_transaction, | ||
4330 | &sbi->s_committed_transaction); | ||
4331 | list_splice_init(&sbi->s_active_transaction, | ||
4332 | &sbi->s_closed_transaction); | ||
4333 | sbi->s_last_transaction = handle->h_transaction->t_tid; | ||
4334 | } | ||
4335 | spin_unlock(&sbi->s_md_lock); | ||
4336 | |||
4337 | ext4_mb_free_committed_blocks(sb); | ||
4338 | } | ||
4339 | |||
4340 | static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, | ||
4341 | ext4_group_t group, ext4_grpblk_t block, int count) | ||
4342 | { | ||
4343 | struct ext4_group_info *db = e4b->bd_info; | ||
4344 | struct super_block *sb = e4b->bd_sb; | ||
4345 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
4346 | struct ext4_free_metadata *md; | ||
4347 | int i; | ||
4348 | |||
4349 | BUG_ON(e4b->bd_bitmap_page == NULL); | ||
4350 | BUG_ON(e4b->bd_buddy_page == NULL); | ||
4351 | |||
4352 | ext4_lock_group(sb, group); | ||
4353 | for (i = 0; i < count; i++) { | ||
4354 | md = db->bb_md_cur; | ||
4355 | if (md && db->bb_tid != handle->h_transaction->t_tid) { | ||
4356 | db->bb_md_cur = NULL; | ||
4357 | md = NULL; | ||
4358 | } | ||
4359 | |||
4360 | if (md == NULL) { | ||
4361 | ext4_unlock_group(sb, group); | ||
4362 | md = kmalloc(sizeof(*md), GFP_NOFS); | ||
4363 | if (md == NULL) | ||
4364 | return -ENOMEM; | ||
4365 | md->num = 0; | ||
4366 | md->group = group; | ||
4367 | |||
4368 | ext4_lock_group(sb, group); | ||
4369 | if (db->bb_md_cur == NULL) { | ||
4370 | spin_lock(&sbi->s_md_lock); | ||
4371 | list_add(&md->list, &sbi->s_active_transaction); | ||
4372 | spin_unlock(&sbi->s_md_lock); | ||
4373 | /* protect buddy cache from being freed, | ||
4374 | * otherwise we'll refresh it from | ||
4375 | * on-disk bitmap and lose not-yet-available | ||
4376 | * blocks */ | ||
4377 | page_cache_get(e4b->bd_buddy_page); | ||
4378 | page_cache_get(e4b->bd_bitmap_page); | ||
4379 | db->bb_md_cur = md; | ||
4380 | db->bb_tid = handle->h_transaction->t_tid; | ||
4381 | mb_debug("new md 0x%p for group %lu\n", | ||
4382 | md, md->group); | ||
4383 | } else { | ||
4384 | kfree(md); | ||
4385 | md = db->bb_md_cur; | ||
4386 | } | ||
4387 | } | ||
4388 | |||
4389 | BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); | ||
4390 | md->blocks[md->num] = block + i; | ||
4391 | md->num++; | ||
4392 | if (md->num == EXT4_BB_MAX_BLOCKS) { | ||
4393 | /* no more space, put full container on a sb's list */ | ||
4394 | db->bb_md_cur = NULL; | ||
4395 | } | ||
4396 | } | ||
4397 | ext4_unlock_group(sb, group); | ||
4398 | return 0; | ||
4399 | } | ||
4400 | |||
4401 | /* | ||
4402 | * Main entry point into mballoc to free blocks | ||
4403 | */ | ||
4404 | void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, | ||
4405 | unsigned long block, unsigned long count, | ||
4406 | int metadata, unsigned long *freed) | ||
4407 | { | ||
4408 | struct buffer_head *bitmap_bh = NULL; | ||
4409 | struct super_block *sb = inode->i_sb; | ||
4410 | struct ext4_allocation_context ac; | ||
4411 | struct ext4_group_desc *gdp; | ||
4412 | struct ext4_super_block *es; | ||
4413 | unsigned long overflow; | ||
4414 | ext4_grpblk_t bit; | ||
4415 | struct buffer_head *gd_bh; | ||
4416 | ext4_group_t block_group; | ||
4417 | struct ext4_sb_info *sbi; | ||
4418 | struct ext4_buddy e4b; | ||
4419 | int err = 0; | ||
4420 | int ret; | ||
4421 | |||
4422 | *freed = 0; | ||
4423 | |||
4424 | ext4_mb_poll_new_transaction(sb, handle); | ||
4425 | |||
4426 | sbi = EXT4_SB(sb); | ||
4427 | es = EXT4_SB(sb)->s_es; | ||
4428 | if (block < le32_to_cpu(es->s_first_data_block) || | ||
4429 | block + count < block || | ||
4430 | block + count > ext4_blocks_count(es)) { | ||
4431 | ext4_error(sb, __FUNCTION__, | ||
4432 | "Freeing blocks not in datazone - " | ||
4433 | "block = %lu, count = %lu", block, count); | ||
4434 | goto error_return; | ||
4435 | } | ||
4436 | |||
4437 | ext4_debug("freeing block %lu\n", block); | ||
4438 | |||
4439 | ac.ac_op = EXT4_MB_HISTORY_FREE; | ||
4440 | ac.ac_inode = inode; | ||
4441 | ac.ac_sb = sb; | ||
4442 | |||
4443 | do_more: | ||
4444 | overflow = 0; | ||
4445 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); | ||
4446 | |||
4447 | /* | ||
4448 | * Check to see if we are freeing blocks across a group | ||
4449 | * boundary. | ||
4450 | */ | ||
4451 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { | ||
4452 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); | ||
4453 | count -= overflow; | ||
4454 | } | ||
4455 | bitmap_bh = read_block_bitmap(sb, block_group); | ||
4456 | if (!bitmap_bh) | ||
4457 | goto error_return; | ||
4458 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); | ||
4459 | if (!gdp) | ||
4460 | goto error_return; | ||
4461 | |||
4462 | if (in_range(ext4_block_bitmap(sb, gdp), block, count) || | ||
4463 | in_range(ext4_inode_bitmap(sb, gdp), block, count) || | ||
4464 | in_range(block, ext4_inode_table(sb, gdp), | ||
4465 | EXT4_SB(sb)->s_itb_per_group) || | ||
4466 | in_range(block + count - 1, ext4_inode_table(sb, gdp), | ||
4467 | EXT4_SB(sb)->s_itb_per_group)) { | ||
4468 | |||
4469 | ext4_error(sb, __FUNCTION__, | ||
4470 | "Freeing blocks in system zone - " | ||
4471 | "Block = %lu, count = %lu", block, count); | ||
4472 | } | ||
4473 | |||
4474 | BUFFER_TRACE(bitmap_bh, "getting write access"); | ||
4475 | err = ext4_journal_get_write_access(handle, bitmap_bh); | ||
4476 | if (err) | ||
4477 | goto error_return; | ||
4478 | |||
4479 | /* | ||
4480 | * We are about to modify some metadata. Call the journal APIs | ||
4481 | * to unshare ->b_data if a currently-committing transaction is | ||
4482 | * using it | ||
4483 | */ | ||
4484 | BUFFER_TRACE(gd_bh, "get_write_access"); | ||
4485 | err = ext4_journal_get_write_access(handle, gd_bh); | ||
4486 | if (err) | ||
4487 | goto error_return; | ||
4488 | |||
4489 | err = ext4_mb_load_buddy(sb, block_group, &e4b); | ||
4490 | if (err) | ||
4491 | goto error_return; | ||
4492 | |||
4493 | #ifdef AGGRESSIVE_CHECK | ||
4494 | { | ||
4495 | int i; | ||
4496 | for (i = 0; i < count; i++) | ||
4497 | BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); | ||
4498 | } | ||
4499 | #endif | ||
4500 | mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, | ||
4501 | bit, count); | ||
4502 | |||
4503 | /* We dirtied the bitmap block */ | ||
4504 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | ||
4505 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | ||
4506 | |||
4507 | ac.ac_b_ex.fe_group = block_group; | ||
4508 | ac.ac_b_ex.fe_start = bit; | ||
4509 | ac.ac_b_ex.fe_len = count; | ||
4510 | ext4_mb_store_history(&ac); | ||
4511 | |||
4512 | if (metadata) { | ||
4513 | /* blocks being freed are metadata. these blocks shouldn't | ||
4514 | * be used until this transaction is committed */ | ||
4515 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); | ||
4516 | } else { | ||
4517 | ext4_lock_group(sb, block_group); | ||
4518 | err = mb_free_blocks(inode, &e4b, bit, count); | ||
4519 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | ||
4520 | ext4_unlock_group(sb, block_group); | ||
4521 | BUG_ON(err != 0); | ||
4522 | } | ||
4523 | |||
4524 | spin_lock(sb_bgl_lock(sbi, block_group)); | ||
4525 | gdp->bg_free_blocks_count = | ||
4526 | cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); | ||
4527 | gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); | ||
4528 | spin_unlock(sb_bgl_lock(sbi, block_group)); | ||
4529 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | ||
4530 | |||
4531 | ext4_mb_release_desc(&e4b); | ||
4532 | |||
4533 | *freed += count; | ||
4534 | |||
4535 | /* And the group descriptor block */ | ||
4536 | BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); | ||
4537 | ret = ext4_journal_dirty_metadata(handle, gd_bh); | ||
4538 | if (!err) | ||
4539 | err = ret; | ||
4540 | |||
4541 | if (overflow && !err) { | ||
4542 | block += count; | ||
4543 | count = overflow; | ||
4544 | put_bh(bitmap_bh); | ||
4545 | goto do_more; | ||
4546 | } | ||
4547 | sb->s_dirt = 1; | ||
4548 | error_return: | ||
4549 | brelse(bitmap_bh); | ||
4550 | ext4_std_error(sb, err); | ||
4551 | return; | ||
4552 | } | ||