Diffstat (limited to 'fs/ext4')

-rw-r--r--  fs/ext4/Makefile         |    2
-rw-r--r--  fs/ext4/acl.c            |   98
-rw-r--r--  fs/ext4/acl.h            |    4
-rw-r--r--  fs/ext4/balloc.c         |   48
-rw-r--r--  fs/ext4/block_validity.c |   21
-rw-r--r--  fs/ext4/ext4.h           |   61
-rw-r--r--  fs/ext4/ext4_jbd2.h      |    4
-rw-r--r--  fs/ext4/extents.c        |  129
-rw-r--r--  fs/ext4/file.c           |   23
-rw-r--r--  fs/ext4/fsync.c          |   64
-rw-r--r--  fs/ext4/ialloc.c         |    2
-rw-r--r--  fs/ext4/indirect.c       | 1487
-rw-r--r--  fs/ext4/inode.c          | 1762
-rw-r--r--  fs/ext4/ioctl.c          |   12
-rw-r--r--  fs/ext4/mballoc.c        |  230
-rw-r--r--  fs/ext4/mballoc.h        |    1
-rw-r--r--  fs/ext4/namei.c          |   56
-rw-r--r--  fs/ext4/page-io.c        |   42
-rw-r--r--  fs/ext4/resize.c         |  199
-rw-r--r--  fs/ext4/super.c          |  117
-rw-r--r--  fs/ext4/truncate.h       |   43
-rw-r--r--  fs/ext4/xattr.c          |    6

22 files changed, 2355 insertions(+), 2056 deletions(-)
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 04109460ba9..56fd8f86593 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-		mmp.o
+		mmp.o indirect.o

 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 21eacd7b7d7..a5c29bb3b83 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -131,7 +131,7 @@ fail:
  *
  * inode->i_mutex: don't care
  */
-static struct posix_acl *
+struct posix_acl *
 ext4_get_acl(struct inode *inode, int type)
 {
 	int name_index;
@@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	case ACL_TYPE_ACCESS:
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl) {
-			mode_t mode = inode->i_mode;
-			error = posix_acl_equiv_mode(acl, &mode);
+			error = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (error < 0)
 				return error;
 			else {
-				inode->i_mode = mode;
 				inode->i_ctime = ext4_current_time(inode);
 				ext4_mark_inode_dirty(handle, inode);
 				if (error == 0)
@@ -237,29 +235,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	return error;
 }

-int
-ext4_check_acl(struct inode *inode, int mask, unsigned int flags)
-{
-	struct posix_acl *acl;
-
-	if (flags & IPERM_FLAG_RCU) {
-		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
-			return -ECHILD;
-		return -EAGAIN;
-	}
-
-	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
-	if (IS_ERR(acl))
-		return PTR_ERR(acl);
-	if (acl) {
-		int error = posix_acl_permission(inode, acl, mask);
-		posix_acl_release(acl);
-		return error;
-	}
-
-	return -EAGAIN;
-}
-
 /*
  * Initialize the ACLs of a new inode. Called from ext4_new_inode.
  *
@@ -282,31 +257,20 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 			inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
-		struct posix_acl *clone;
-		mode_t mode;
-
 		if (S_ISDIR(inode->i_mode)) {
 			error = ext4_set_acl(handle, inode,
 					     ACL_TYPE_DEFAULT, acl);
 			if (error)
 				goto cleanup;
 		}
-		clone = posix_acl_clone(acl, GFP_NOFS);
-		error = -ENOMEM;
-		if (!clone)
-			goto cleanup;
-
-		mode = inode->i_mode;
-		error = posix_acl_create_masq(clone, &mode);
-		if (error >= 0) {
-			inode->i_mode = mode;
-			if (error > 0) {
-				/* This is an extended ACL */
-				error = ext4_set_acl(handle, inode,
-						     ACL_TYPE_ACCESS, clone);
-			}
+		error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
+		if (error < 0)
+			return error;
+
+		if (error > 0) {
+			/* This is an extended ACL */
+			error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
 		}
-		posix_acl_release(clone);
 	}
 cleanup:
 	posix_acl_release(acl);
@@ -330,9 +294,12 @@ cleanup:
 int
 ext4_acl_chmod(struct inode *inode)
 {
-	struct posix_acl *acl, *clone;
+	struct posix_acl *acl;
+	handle_t *handle;
+	int retries = 0;
 	int error;

+
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
 	if (!test_opt(inode->i_sb, POSIX_ACL))
@@ -340,31 +307,24 @@ ext4_acl_chmod(struct inode *inode)
 	acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
 	if (IS_ERR(acl) || !acl)
 		return PTR_ERR(acl);
-	clone = posix_acl_clone(acl, GFP_KERNEL);
-	posix_acl_release(acl);
-	if (!clone)
-		return -ENOMEM;
-	error = posix_acl_chmod_masq(clone, inode->i_mode);
-	if (!error) {
-		handle_t *handle;
-		int retries = 0;
-
-	retry:
-		handle = ext4_journal_start(inode,
-				EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
-		if (IS_ERR(handle)) {
-			error = PTR_ERR(handle);
-			ext4_std_error(inode->i_sb, error);
-			goto out;
-		}
-		error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, clone);
-		ext4_journal_stop(handle);
-		if (error == -ENOSPC &&
-		    ext4_should_retry_alloc(inode->i_sb, &retries))
-			goto retry;
-	}
+	error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+	if (error)
+		return error;
+retry:
+	handle = ext4_journal_start(inode,
+			EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+	if (IS_ERR(handle)) {
+		error = PTR_ERR(handle);
+		ext4_std_error(inode->i_sb, error);
+		goto out;
+	}
+	error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+	ext4_journal_stop(handle);
+	if (error == -ENOSPC &&
+	    ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
 out:
-	posix_acl_release(clone);
+	posix_acl_release(acl);
 	return error;
 }

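Aside (not part of the commit): posix_acl_equiv_mode() and posix_acl_create() now carry the mode arithmetic that acl.c used to open-code with clone/masq helpers. A simplified, self-contained model of the equivalence check, with hypothetical tag constants and a 3-entry ACL:

#include <stdio.h>

/* Simplified model: an ACL with only owner/group/other entries collapses
 * into mode bits (return 0); anything else, e.g. a named-user entry,
 * still needs an extended ACL stored as an xattr (return 1). */
enum { ACL_USER_OBJ = 1, ACL_USER = 2, ACL_GROUP_OBJ = 4, ACL_OTHER = 32 };

struct acl_entry { int tag; unsigned short perm; };

static int acl_equiv_mode(const struct acl_entry *e, int n, unsigned *mode)
{
	unsigned m = 0;
	for (int i = 0; i < n; i++) {
		switch (e[i].tag) {
		case ACL_USER_OBJ:  m |= e[i].perm << 6; break;
		case ACL_GROUP_OBJ: m |= e[i].perm << 3; break;
		case ACL_OTHER:     m |= e[i].perm;      break;
		default:            return 1;	/* needs an extended ACL */
		}
	}
	*mode = m;
	return 0;
}

int main(void)
{
	struct acl_entry plain[] = {
		{ ACL_USER_OBJ, 6 }, { ACL_GROUP_OBJ, 4 }, { ACL_OTHER, 4 }
	};
	unsigned mode;
	if (acl_equiv_mode(plain, 3, &mode) == 0)
		printf("collapses to mode %03o\n", mode);	/* 644 */
	return 0;
}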
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index dec821168fd..18cb39ed7c7 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
 #ifdef CONFIG_EXT4_FS_POSIX_ACL

 /* acl.c */
-extern int ext4_check_acl(struct inode *, int, unsigned int);
+struct posix_acl *ext4_get_acl(struct inode *inode, int type);
 extern int ext4_acl_chmod(struct inode *);
 extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);

 #else  /* CONFIG_EXT4_FS_POSIX_ACL */
 #include <linux/sched.h>
-#define ext4_check_acl NULL
+#define ext4_get_acl NULL

 static inline int
 ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 264f6949511..f8224adf496 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)

 }

+/**
+ * ext4_inode_to_goal_block - return a hint for block allocation
+ * @inode: inode for block allocation
+ *
+ * Return the ideal location to start allocating blocks for a
+ * newly created inode.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_group_t block_group;
+	ext4_grpblk_t colour;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+	ext4_fsblk_t bg_start;
+	ext4_fsblk_t last_block;
+
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
+	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+		colour = (current->pid % 16) *
+			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	else
+		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+	return bg_start + colour;
+}
+
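Aside (not part of the commit): a worked example of the goal arithmetic above, under hypothetical geometry (32768 blocks per group, flex_size 16, a regular file in block group 37, pid 1234; the s_first_data_block offset that ext4_group_first_block_no() would add is ignored here):

#include <stdio.h>

int main(void)
{
	unsigned long blocks_per_group = 32768, flex_size = 16;
	unsigned long block_group = 37, pid = 1234;
	unsigned long bg_start, colour;

	block_group &= ~(flex_size - 1);	/* 32: first group of the flexgroup */
	block_group++;				/* 33: regular files skip the first group */

	bg_start = block_group * blocks_per_group;	/* 1081344 */
	colour = (pid % 16) * (blocks_per_group / 16);	/* 2 * 2048 = 4096 */

	printf("goal = %lu\n", bg_start + colour);	/* 1085440 */
	return 0;
}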
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index fac90f3fba8..8efb2f0a344 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
 	return 1;
 }

+int ext4_check_blockref(const char *function, unsigned int line,
+			struct inode *inode, __le32 *p, unsigned int max)
+{
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	__le32 *bref = p;
+	unsigned int blk;
+
+	while (bref < p+max) {
+		blk = le32_to_cpu(*bref++);
+		if (blk &&
+		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						    blk, 1))) {
+			es->s_last_error_block = cpu_to_le64(blk);
+			ext4_error_inode(inode, function, line, blk,
+					 "invalid block");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1921392cd70..5c38120c389 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -175,6 +175,7 @@ struct mpage_da_data {
  */
 #define EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_ERROR	0x0002
+#define EXT4_IO_END_QUEUED	0x0004

 struct ext4_io_page {
 	struct page	*p_page;
@@ -357,8 +358,7 @@ struct flex_groups {

 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
-			   EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
-			   EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+			   EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
 			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)

@@ -526,6 +526,7 @@ struct ext4_new_group_data {
 #define EXT4_FREE_BLOCKS_METADATA	0x0001
 #define EXT4_FREE_BLOCKS_FORGET		0x0002
 #define EXT4_FREE_BLOCKS_VALIDATED	0x0004
+#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE	0x0008

 /*
  * ioctl commands
@@ -939,6 +940,8 @@ struct ext4_inode_info {
 #define ext4_find_next_zero_bit		find_next_zero_bit_le
 #define ext4_find_next_bit		find_next_bit_le

+extern void ext4_set_bits(void *bm, int cur, int len);
+
 /*
  * Maximal mount counts between two filesystem checks
  */
@@ -1126,7 +1129,8 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
-	struct mutex s_resize_lock;
+	unsigned long s_resize_flags;		/* Flags indicating if there
+						   is a resizer */
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
@@ -1214,6 +1218,9 @@ struct ext4_sb_info {

 	/* Kernel thread for multiple mount protection */
 	struct task_struct *s_mmp_tsk;
+
+	/* record the last minlen when FITRIM is called. */
+	atomic_t s_last_trim_minblks;
 };

 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1743,6 +1750,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
					struct ext4_group_desc *desc);
 #define ext4_free_blocks_after_init(sb, group, desc)			\
		ext4_init_block_bitmap(sb, NULL, group, desc)
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -1758,7 +1766,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);

 /* fsync.c */
-extern int ext4_sync_file(struct file *, int);
+extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
 extern int ext4_flush_completed_IO(struct inode *);

 /* hash.c */
@@ -1793,7 +1801,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
		ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);

@@ -1834,6 +1842,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
					int used, int quota_claim);
+
+/* indirect.c */
+extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+				struct ext4_map_blocks *map, int flags);
+extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+				const struct iovec *iov, loff_t offset,
+				unsigned long nr_segs);
+extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern void ext4_ind_truncate(struct inode *inode);
+
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1855,6 +1874,9 @@ extern int ext4_group_extend(struct super_block *sb,
				ext4_fsblk_t n_blocks_count);

 /* super.c */
+extern void *ext4_kvmalloc(size_t size, gfp_t flags);
+extern void *ext4_kvzalloc(size_t size, gfp_t flags);
+extern void ext4_kvfree(void *ptr);
 extern void __ext4_error(struct super_block *, const char *, unsigned int,
		 const char *, ...)
	__attribute__ ((format (printf, 4, 5)));
@@ -2067,11 +2089,19 @@ struct ext4_group_info {
					 * 5 free 8-block regions. */
 };

-#define EXT4_GROUP_INFO_NEED_INIT_BIT	0
+#define EXT4_GROUP_INFO_NEED_INIT_BIT		0
+#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1

 #define EXT4_MB_GRP_NEED_INIT(grp)	\
	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))

+#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
+	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
+	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
+	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2

@@ -2123,6 +2153,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb)
 }

 /*
+ * Block validity checking
+ */
+#define ext4_check_indirect_blockref(inode, bh)				\
+	ext4_check_blockref(__func__, __LINE__, inode,			\
+			    (__le32 *)(bh)->b_data,			\
+			    EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_ind_check_inode(inode)					\
+	ext4_check_blockref(__func__, __LINE__, inode,			\
+			    EXT4_I(inode)->i_data,			\
+			    EXT4_NDIR_BLOCKS)
+
+/*
  * Inodes and files operations
  */

@@ -2151,6 +2194,8 @@ extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
				 ext4_fsblk_t start_blk,
				 unsigned int count);
+extern int ext4_check_blockref(const char *, unsigned int,
+			       struct inode *, __le32 *, unsigned int);

 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
@@ -2230,6 +2275,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
 extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];

+#define EXT4_RESIZING	0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
 #endif	/* __KERNEL__ */

 #endif	/* _EXT4_H */
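Aside (not part of the commit): the new WAS_TRIMMED bit and s_last_trim_minblks cooperate so a later FITRIM can skip groups that were already trimmed with an equal or smaller minimum extent length; only a stricter (smaller) minlen forces a re-trim. A standalone model of that gating:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical standalone model, not the mballoc code itself. */
struct group { unsigned long bb_state; };
#define WAS_TRIMMED 0x2

static bool should_trim(const struct group *g, unsigned long minlen,
			unsigned long last_trim_minblks)
{
	/* Already trimmed, and the new request is no stricter: skip. */
	if ((g->bb_state & WAS_TRIMMED) && minlen >= last_trim_minblks)
		return false;
	return true;
}

int main(void)
{
	struct group g = { .bb_state = WAS_TRIMMED };
	printf("%d %d\n",
	       should_trim(&g, 4096, 1024),	/* 0: nothing new to trim */
	       should_trim(&g, 512, 1024));	/* 1: smaller minlen, re-trim */
	return 0;
}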
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index bb85757689b..5802fa1dab1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode)

 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-	if (!S_ISREG(inode->i_mode))
-		return 0;
 	if (EXT4_JOURNAL(inode) == NULL)
 		return 1;
+	if (!S_ISREG(inode->i_mode))
+		return 0;
 	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
 		return 0;
 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
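Aside (not part of the commit): the only input whose answer changes in the hunk above is a non-regular inode (e.g. a directory) on a no-journal filesystem, which now reports writeback instead of not. A standalone truth-table check of the two orderings:

#include <stdbool.h>
#include <stdio.h>

static bool old_pred(bool is_reg, bool has_journal)
{
	if (!is_reg)
		return false;
	if (!has_journal)
		return true;
	return false;	/* further journal-mode checks elided */
}

static bool new_pred(bool is_reg, bool has_journal)
{
	if (!has_journal)
		return true;
	if (!is_reg)
		return false;
	return false;	/* further journal-mode checks elided */
}

int main(void)
{
	/* non-regular inode, no journal: the single differing case */
	printf("old=%d new=%d\n", old_pred(false, false), new_pred(false, false));
	return 0;
}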
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f815cc81e7a..57cf568a98a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
			      struct ext4_ext_path *path,
			      ext4_lblk_t block)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	ext4_fsblk_t bg_start;
-	ext4_fsblk_t last_block;
-	ext4_grpblk_t colour;
-	ext4_group_t block_group;
-	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
	int depth;

	if (path) {
@@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
	}

	/* OK. use inode's group */
-	block_group = ei->i_block_group;
-	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
-		/*
-		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
-		 * block groups per flexgroup, reserve the first block
-		 * group for directories and special files.  Regular
-		 * files will start at the second block group.  This
-		 * tends to speed up directory access and improves
-		 * fsck times.
-		 */
-		block_group &= ~(flex_size-1);
-		if (S_ISREG(inode->i_mode))
-			block_group++;
-	}
-	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
-	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
-	/*
-	 * If we are doing delayed allocation, we don't need take
-	 * colour into account.
-	 */
-	if (test_opt(inode->i_sb, DELALLOC))
-		return bg_start;
-
-	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
-		colour = (current->pid % 16) *
-			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
-	else
-		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
-	return bg_start + colour + block;
+	return ext4_inode_to_goal_block(inode);
 }

 /*
@@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
			   logical, le32_to_cpu(curp->p_idx->ei_block));
		return -EIO;
	}
+
+	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
+		EXT4_ERROR_INODE(inode,
+				 "eh_entries %d >= eh_max %d!",
+				 le16_to_cpu(curp->p_hdr->eh_entries),
+				 le16_to_cpu(curp->p_hdr->eh_max));
+		return -EIO;
+	}
+
	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
		/* insert after */
@@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
	ext4_idx_store_pblock(ix, ptr);
	le16_add_cpu(&curp->p_hdr->eh_entries, 1);

-	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
-			     > le16_to_cpu(curp->p_hdr->eh_max))) {
-		EXT4_ERROR_INODE(inode,
-				 "logical %d == ei_block %d!",
-				 logical, le32_to_cpu(curp->p_idx->ei_block));
-		return -EIO;
-	}
	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
		return -EIO;
@@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 * ext4_ext_next_leaf_block:
 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
 */
-static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
-					struct ext4_ext_path *path)
+static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
 {
	int depth;

@@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
		goto merge;
	}

-repeat:
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
@@ -1765,9 +1731,10 @@ repeat:

	/* probably next leaf has space for us? */
	fex = EXT_LAST_EXTENT(eh);
-	next = ext4_ext_next_leaf_block(inode, path);
-	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
-	    && next != EXT_MAX_BLOCKS) {
+	next = EXT_MAX_BLOCKS;
+	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
+		next = ext4_ext_next_leaf_block(path);
+	if (next != EXT_MAX_BLOCKS) {
		ext_debug("next leaf block - %d\n", next);
		BUG_ON(npath != NULL);
		npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1779,7 +1746,7 @@ repeat:
			ext_debug("next leaf isn't full(%d)\n",
				  le16_to_cpu(eh->eh_entries));
			path = npath;
-			goto repeat;
+			goto has_space;
		}
		ext_debug("next leaf has no free space(%d,%d)\n",
			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -1839,7 +1806,7 @@ has_space:
				ext4_ext_pblock(newext),
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
-				nearex, len, nearex + 1, nearex + 2);
+				nearex, len, nearex, nearex + 1);
		memmove(nearex + 1, nearex, len);
		path[depth].p_ext = nearex;
	}
@@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }

 /*
- * ext4_ext_in_cache()
+ * ext4_ext_check_cache()
 * Checks to see if the given block is in the cache.
 * If it is, the cached extent is stored in the given
 * cache extent pointer.  If the cached extent is a hole,
@@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 /*
 * ext4_ext_rm_idx:
 * removes index from the index block.
- * It's used in truncate case only, thus all requests are for
- * last index in the block only.
 */
 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
			struct ext4_ext_path *path)
@@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
	err = ext4_ext_get_access(handle, inode, path);
	if (err)
		return err;
+
+	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
+		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+		len *= sizeof(struct ext4_extent_idx);
+		memmove(path->p_idx, path->p_idx + 1, len);
+	}
+
	le16_add_cpu(&path->p_hdr->eh_entries, -1);
	err = ext4_ext_dirty(handle, inode, path);
	if (err)
@@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
	return 1;
 }

-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-				ext4_lblk_t end)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 {
	struct super_block *sb = inode->i_sb;
	int depth = ext_depth(inode);
@@ -2575,7 +2546,7 @@ again:
		if (i == depth) {
			/* this is leaf block */
			err = ext4_ext_rm_leaf(handle, inode, path,
-					start, end);
+					start, EXT_MAX_BLOCKS - 1);
			/* root level has p_bh == NULL, brelse() eats this */
			brelse(path[i].p_bh);
			path[i].p_bh = NULL;
@@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
					struct ext4_ext_path *path)
 {
	struct ext4_extent *ex;
-	struct ext4_extent_header *eh;
	int depth;
	int err = 0;

	depth = ext_depth(inode);
-	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
@@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);

	/* check in cache */
-	if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
-		((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
+	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
+	    ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
		if (!newex.ee_start_lo && !newex.ee_start_hi) {
			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
				/*
@@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,

			ext4_ext_mark_uninitialized(ex);

-			err = ext4_ext_remove_space(inode, map->m_lblk,
-				map->m_lblk + punched_out);
+			ext4_ext_invalidate_cache(inode);
+
+			err = ext4_ext_rm_leaf(handle, inode, path,
+				map->m_lblk, map->m_lblk + punched_out);
+
+			if (!err && path->p_hdr->eh_entries == 0) {
+				/*
+				 * Punch hole freed all of this sub tree,
+				 * so we need to correct eh_depth
+				 */
+				err = ext4_ext_get_access(handle, inode, path);
+				if (err == 0) {
+					ext_inode_hdr(inode)->eh_depth = 0;
+					ext_inode_hdr(inode)->eh_max =
+					cpu_to_le16(ext4_ext_space_root(
+						inode, 0));
+
+					err = ext4_ext_dirty(
+						handle, inode, path);
+				}
+			}

			goto out2;
		}
@@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
	}

	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
-	if (err)
-		goto out2;
-
-	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+	if (!err)
+		err = ext4_ext_insert_extent(handle, inode, path,
+					     &newex, flags);
	if (err) {
+		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
+			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
		/* free data blocks we just allocated */
		/* not a good idea to call discard here directly,
		 * but otherwise we'd need to call it every free() */
		ext4_discard_preallocations(inode);
		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
-				 ext4_ext_get_actual_len(&newex), 0);
+				 ext4_ext_get_actual_len(&newex), fb_flags);
		goto out2;
	}

@@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode)

	last_block = (inode->i_size + sb->s_blocksize - 1)
			>> EXT4_BLOCK_SIZE_BITS(sb);
-	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+	err = ext4_ext_remove_space(inode, last_block);

	/* In a multi-transaction truncate, we only make the final
	 * transaction synchronous.
@@ -3835,7 +3824,7 @@ retry:
				    blkbits) >> blkbits))
			new_size = offset + len;
		else
-			new_size = (map.m_lblk + ret) << blkbits;
+			new_size = ((loff_t) map.m_lblk + ret) << blkbits;

		ext4_falloc_update_inode(inode, mode, new_size,
					 (map.m_flags & EXT4_MAP_NEW));
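Aside (not part of the commit): the ext4_ext_rm_idx() hunk above drops the old "last index only" restriction by shifting the tail of the index array down with memmove() before decrementing eh_entries. The same operation on a plain array:

#include <stdio.h>
#include <string.h>

struct idx { unsigned block; };

/* Remove *victim from arr[0..*entries), preserving order. */
static void rm_idx(struct idx *arr, int *entries, struct idx *victim)
{
	struct idx *last = arr + *entries - 1;
	if (victim != last)
		memmove(victim, victim + 1,
			(last - victim) * sizeof(struct idx));
	(*entries)--;
}

int main(void)
{
	struct idx a[] = { {10}, {20}, {30}, {40} };
	int n = 4;

	rm_idx(a, &n, &a[1]);		/* remove the middle entry 20 */
	for (int i = 0; i < n; i++)
		printf("%u ", a[i].block);	/* 10 30 40 */
	printf("\n");
	return 0;
}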
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2c097232200..e4095e988eb 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -236,6 +236,27 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
		}
		offset += file->f_pos;
		break;
+	case SEEK_DATA:
+		/*
+		 * In the generic case the entire file is data, so as long as
+		 * offset isn't at the end of the file then the offset is data.
+		 */
+		if (offset >= inode->i_size) {
+			mutex_unlock(&inode->i_mutex);
+			return -ENXIO;
+		}
+		break;
+	case SEEK_HOLE:
+		/*
+		 * There is a virtual hole at the end of the file, so as long as
+		 * offset isn't i_size or larger, return i_size.
+		 */
+		if (offset >= inode->i_size) {
+			mutex_unlock(&inode->i_mutex);
+			return -ENXIO;
+		}
+		offset = inode->i_size;
+		break;
	}

	if (offset < 0 || offset > maxbytes) {
@@ -280,7 +301,7 @@ const struct inode_operations ext4_file_inode_operations = {
	.listxattr	= ext4_listxattr,
	.removexattr	= generic_removexattr,
 #endif
-	.check_acl	= ext4_check_acl,
+	.get_acl	= ext4_get_acl,
	.fiemap		= ext4_fiemap,
 };

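Aside (not part of the commit): from userspace, the generic SEEK_DATA/SEEK_HOLE behaviour added above is visible through lseek(). A minimal demo (the path is illustrative; requires a kernel and C library that define these flags):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/testfile", O_RDONLY);
	if (fd < 0)
		return 1;

	off_t data = lseek(fd, 0, SEEK_DATA);	/* 0 for a non-empty file */
	off_t hole = lseek(fd, 0, SEEK_HOLE);	/* i_size under this generic scheme */

	printf("data at %lld, hole at %lld\n",
	       (long long)data, (long long)hole);
	close(fd);
	return 0;
}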
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ce66d2fe826..036f78f7a1e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode)
 {
	struct writeback_control wbc;
	struct dentry *dentry = NULL;
+	struct inode *next;
	int ret = 0;

-	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+	if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+		return 0;
+	inode = igrab(inode);
+	while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
-		dentry = list_entry(inode->i_dentry.next,
-				    struct dentry, d_alias);
-		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+		dentry = NULL;
+		spin_lock(&inode->i_lock);
+		if (!list_empty(&inode->i_dentry)) {
+			dentry = list_first_entry(&inode->i_dentry,
+						  struct dentry, d_alias);
+			dget(dentry);
+		}
+		spin_unlock(&inode->i_lock);
+		if (!dentry)
+			break;
+		next = igrab(dentry->d_parent->d_inode);
+		dput(dentry);
+		if (!next)
			break;
-		inode = dentry->d_parent->d_inode;
+		iput(inode);
+		inode = next;
		ret = sync_mapping_buffers(inode->i_mapping);
		if (ret)
			break;
@@ -148,6 +163,33 @@ static int ext4_sync_parent(struct inode *inode)
		if (ret)
			break;
	}
+	iput(inode);
+	return ret;
+}
+
+/**
+ * __sync_file - generic_file_fsync without the locking and filemap_write
+ * @inode:	inode to sync
+ * @datasync:	only sync essential metadata if true
+ *
+ * This is just generic_file_fsync without the locking.  This is needed for
+ * nojournal mode to make sure this inodes data/metadata makes it to disk
+ * properly.  The i_mutex should be held already.
+ */
+static int __sync_inode(struct inode *inode, int datasync)
+{
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = sync_inode_metadata(inode, 1);
+	if (ret == 0)
+		ret = err;
	return ret;
 }

@@ -165,7 +207,7 @@ static int ext4_sync_parent(struct inode *inode)
 * i_mutex lock is held when entering and exiting this function
 */

-int ext4_sync_file(struct file *file, int datasync)
+int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
	struct inode *inode = file->f_mapping->host;
	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -178,15 +220,20 @@ int ext4_sync_file(struct file *file, int datasync)

	trace_ext4_sync_file_enter(file, datasync);

+	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	if (ret)
+		return ret;
+	mutex_lock(&inode->i_mutex);
+
	if (inode->i_sb->s_flags & MS_RDONLY)
-		return 0;
+		goto out;

	ret = ext4_flush_completed_IO(inode);
	if (ret < 0)
		goto out;

	if (!journal) {
-		ret = generic_file_fsync(file, datasync);
+		ret = __sync_inode(inode, datasync);
		if (!ret && !list_empty(&inode->i_dentry))
			ret = ext4_sync_parent(inode);
		goto out;
@@ -220,6 +267,7 @@ int ext4_sync_file(struct file *file, int datasync)
	if (needs_barrier)
		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 out:
+	mutex_unlock(&inode->i_mutex);
	trace_ext4_sync_file_exit(inode, ret);
	return ret;
 }
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21bb2f61e50..9c63f273b55 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
			   group, used_blks,
			   ext4_itable_unused_count(sb, gdp));
		ret = 1;
-		goto out;
+		goto err_out;
	}

	blk = ext4_inode_table(sb, gdp) + used_blks;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 00000000000..0962642119c --- /dev/null +++ b/fs/ext4/indirect.c | |||
@@ -0,0 +1,1487 @@ | |||
1 | /* | ||
2 | * linux/fs/ext4/indirect.c | ||
3 | * | ||
4 | * from | ||
5 | * | ||
6 | * linux/fs/ext4/inode.c | ||
7 | * | ||
8 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
9 | * Remy Card (card@masi.ibp.fr) | ||
10 | * Laboratoire MASI - Institut Blaise Pascal | ||
11 | * Universite Pierre et Marie Curie (Paris VI) | ||
12 | * | ||
13 | * from | ||
14 | * | ||
15 | * linux/fs/minix/inode.c | ||
16 | * | ||
17 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
18 | * | ||
19 | * Goal-directed block allocation by Stephen Tweedie | ||
20 | * (sct@redhat.com), 1993, 1998 | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include "ext4_jbd2.h" | ||
25 | #include "truncate.h" | ||
26 | |||
27 | #include <trace/events/ext4.h> | ||
28 | |||
29 | typedef struct { | ||
30 | __le32 *p; | ||
31 | __le32 key; | ||
32 | struct buffer_head *bh; | ||
33 | } Indirect; | ||
34 | |||
35 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | ||
36 | { | ||
37 | p->key = *(p->p = v); | ||
38 | p->bh = bh; | ||
39 | } | ||
40 | |||
41 | /** | ||
42 | * ext4_block_to_path - parse the block number into array of offsets | ||
43 | * @inode: inode in question (we are only interested in its superblock) | ||
44 | * @i_block: block number to be parsed | ||
45 | * @offsets: array to store the offsets in | ||
46 | * @boundary: set this non-zero if the referred-to block is likely to be | ||
47 | * followed (on disk) by an indirect block. | ||
48 | * | ||
49 | * To store the locations of file's data ext4 uses a data structure common | ||
50 | * for UNIX filesystems - tree of pointers anchored in the inode, with | ||
51 | * data blocks at leaves and indirect blocks in intermediate nodes. | ||
52 | * This function translates the block number into path in that tree - | ||
53 | * return value is the path length and @offsets[n] is the offset of | ||
54 | * pointer to (n+1)th node in the nth one. If @block is out of range | ||
55 | * (negative or too large) warning is printed and zero returned. | ||
56 | * | ||
57 | * Note: function doesn't find node addresses, so no IO is needed. All | ||
58 | * we need to know is the capacity of indirect blocks (taken from the | ||
59 | * inode->i_sb). | ||
60 | */ | ||
61 | |||
62 | /* | ||
63 | * Portability note: the last comparison (check that we fit into triple | ||
64 | * indirect block) is spelled differently, because otherwise on an | ||
65 | * architecture with 32-bit longs and 8Kb pages we might get into trouble | ||
66 | * if our filesystem had 8Kb blocks. We might use long long, but that would | ||
67 | * kill us on x86. Oh, well, at least the sign propagation does not matter - | ||
68 | * i_block would have to be negative in the very beginning, so we would not | ||
69 | * get there at all. | ||
70 | */ | ||
71 | |||
72 | static int ext4_block_to_path(struct inode *inode, | ||
73 | ext4_lblk_t i_block, | ||
74 | ext4_lblk_t offsets[4], int *boundary) | ||
75 | { | ||
76 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
77 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | ||
78 | const long direct_blocks = EXT4_NDIR_BLOCKS, | ||
79 | indirect_blocks = ptrs, | ||
80 | double_blocks = (1 << (ptrs_bits * 2)); | ||
81 | int n = 0; | ||
82 | int final = 0; | ||
83 | |||
84 | if (i_block < direct_blocks) { | ||
85 | offsets[n++] = i_block; | ||
86 | final = direct_blocks; | ||
87 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | ||
88 | offsets[n++] = EXT4_IND_BLOCK; | ||
89 | offsets[n++] = i_block; | ||
90 | final = ptrs; | ||
91 | } else if ((i_block -= indirect_blocks) < double_blocks) { | ||
92 | offsets[n++] = EXT4_DIND_BLOCK; | ||
93 | offsets[n++] = i_block >> ptrs_bits; | ||
94 | offsets[n++] = i_block & (ptrs - 1); | ||
95 | final = ptrs; | ||
96 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { | ||
97 | offsets[n++] = EXT4_TIND_BLOCK; | ||
98 | offsets[n++] = i_block >> (ptrs_bits * 2); | ||
99 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); | ||
100 | offsets[n++] = i_block & (ptrs - 1); | ||
101 | final = ptrs; | ||
102 | } else { | ||
103 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", | ||
104 | i_block + direct_blocks + | ||
105 | indirect_blocks + double_blocks, inode->i_ino); | ||
106 | } | ||
107 | if (boundary) | ||
108 | *boundary = final - 1 - (i_block & (ptrs - 1)); | ||
109 | return n; | ||
110 | } | ||
111 | |||
112 | /** | ||
113 | * ext4_get_branch - read the chain of indirect blocks leading to data | ||
114 | * @inode: inode in question | ||
115 | * @depth: depth of the chain (1 - direct pointer, etc.) | ||
116 | * @offsets: offsets of pointers in inode/indirect blocks | ||
117 | * @chain: place to store the result | ||
118 | * @err: here we store the error value | ||
119 | * | ||
120 | * Function fills the array of triples <key, p, bh> and returns %NULL | ||
121 | * if everything went OK or the pointer to the last filled triple | ||
122 | * (incomplete one) otherwise. Upon the return chain[i].key contains | ||
123 | * the number of (i+1)-th block in the chain (as it is stored in memory, | ||
124 | * i.e. little-endian 32-bit), chain[i].p contains the address of that | ||
125 | * number (it points into struct inode for i==0 and into the bh->b_data | ||
126 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect | ||
127 | * block for i>0 and NULL for i==0. In other words, it holds the block | ||
128 | * numbers of the chain, addresses they were taken from (and where we can | ||
129 | * verify that chain did not change) and buffer_heads hosting these | ||
130 | * numbers. | ||
131 | * | ||
132 | * Function stops when it stumbles upon zero pointer (absent block) | ||
133 | * (pointer to last triple returned, *@err == 0) | ||
134 | * or when it gets an IO error reading an indirect block | ||
135 | * (ditto, *@err == -EIO) | ||
136 | * or when it reads all @depth-1 indirect blocks successfully and finds | ||
137 | * the whole chain, all way to the data (returns %NULL, *err == 0). | ||
138 | * | ||
139 | * Need to be called with | ||
140 | * down_read(&EXT4_I(inode)->i_data_sem) | ||
141 | */ | ||
142 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | ||
143 | ext4_lblk_t *offsets, | ||
144 | Indirect chain[4], int *err) | ||
145 | { | ||
146 | struct super_block *sb = inode->i_sb; | ||
147 | Indirect *p = chain; | ||
148 | struct buffer_head *bh; | ||
149 | |||
150 | *err = 0; | ||
151 | /* i_data is not going away, no lock needed */ | ||
152 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); | ||
153 | if (!p->key) | ||
154 | goto no_block; | ||
155 | while (--depth) { | ||
156 | bh = sb_getblk(sb, le32_to_cpu(p->key)); | ||
157 | if (unlikely(!bh)) | ||
158 | goto failure; | ||
159 | |||
160 | if (!bh_uptodate_or_lock(bh)) { | ||
161 | if (bh_submit_read(bh) < 0) { | ||
162 | put_bh(bh); | ||
163 | goto failure; | ||
164 | } | ||
165 | /* validate block references */ | ||
166 | if (ext4_check_indirect_blockref(inode, bh)) { | ||
167 | put_bh(bh); | ||
168 | goto failure; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); | ||
173 | /* Reader: end */ | ||
174 | if (!p->key) | ||
175 | goto no_block; | ||
176 | } | ||
177 | return NULL; | ||
178 | |||
179 | failure: | ||
180 | *err = -EIO; | ||
181 | no_block: | ||
182 | return p; | ||
183 | } | ||
184 | |||
185 | /** | ||
186 | * ext4_find_near - find a place for allocation with sufficient locality | ||
187 | * @inode: owner | ||
188 | * @ind: descriptor of indirect block. | ||
189 | * | ||
190 | * This function returns the preferred place for block allocation. | ||
191 | * It is used when heuristic for sequential allocation fails. | ||
192 | * Rules are: | ||
193 | * + if there is a block to the left of our position - allocate near it. | ||
194 | * + if pointer will live in indirect block - allocate near that block. | ||
195 | * + if pointer will live in inode - allocate in the same | ||
196 | * cylinder group. | ||
197 | * | ||
198 | * In the latter case we colour the starting block by the callers PID to | ||
199 | * prevent it from clashing with concurrent allocations for a different inode | ||
200 | * in the same block group. The PID is used here so that functionally related | ||
201 | * files will be close-by on-disk. | ||
202 | * | ||
203 | * Caller must make sure that @ind is valid and will stay that way. | ||
204 | */ | ||
205 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | ||
206 | { | ||
207 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
208 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; | ||
209 | __le32 *p; | ||
210 | |||
211 | /* Try to find previous block */ | ||
212 | for (p = ind->p - 1; p >= start; p--) { | ||
213 | if (*p) | ||
214 | return le32_to_cpu(*p); | ||
215 | } | ||
216 | |||
217 | /* No such thing, so let's try location of indirect block */ | ||
218 | if (ind->bh) | ||
219 | return ind->bh->b_blocknr; | ||
220 | |||
221 | /* | ||
222 | * It is going to be referred to from the inode itself? OK, just put it | ||
223 | * into the same cylinder group then. | ||
224 | */ | ||
225 | return ext4_inode_to_goal_block(inode); | ||
226 | } | ||
227 | |||
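A compilable toy version of that heuristic, stripped of buffer_heads; find_near and its parameter names are made up for illustration, not ext4 API:

#include <stdint.h>

/* Scan left of the slot for the closest allocated block; fall back
 * to the block hosting the pointer array; fall back to the inode's
 * goal block (the "same cylinder group" case above). */
static uint32_t find_near(const uint32_t *start, const uint32_t *pos,
			  uint32_t host_block, uint32_t inode_goal)
{
	const uint32_t *p;

	for (p = pos - 1; p >= start; p--)
		if (*p)
			return *p;	/* previous allocated block */
	if (host_block)
		return host_block;	/* locality of the indirect block */
	return inode_goal;
}

int main(void)
{
	uint32_t map[8] = {100, 0, 103, 0, 0, 0, 0, 0};

	/* Allocating for slot 5: nearest allocated neighbour is 103. */
	return find_near(map, map + 5, 0, 9000) == 103 ? 0 : 1;
}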
228 | /** | ||
229 | * ext4_find_goal - find a preferred place for allocation. | ||
230 | * @inode: owner | ||
231 | * @block: block we want | ||
232 | * @partial: pointer to the last triple within a chain | ||
233 | * | ||
234 | * Normally this function finds the preferred place for block allocation | ||
235 | * and returns it. | ||
236 | * Because this is only used for non-extent files, we limit the block nr | ||
237 | * to 32 bits. | ||
238 | */ | ||
239 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | ||
240 | Indirect *partial) | ||
241 | { | ||
242 | ext4_fsblk_t goal; | ||
243 | |||
244 | /* | ||
245 | * XXX need to get goal block from mballoc's data structures | ||
246 | */ | ||
247 | |||
248 | goal = ext4_find_near(inode, partial); | ||
249 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
250 | return goal; | ||
251 | } | ||
252 | |||
253 | /** | ||
254 | * ext4_blks_to_allocate - Look up the block map and count the number | ||
255 | * of direct blocks that need to be allocated for the given branch. | ||
256 | * | ||
257 | * @branch: chain of indirect blocks | ||
258 | * @k: number of blocks needed for indirect blocks | ||
259 | * @blks: number of data blocks to be mapped. | ||
260 | * @blocks_to_boundary: the offset in the indirect block | ||
261 | * | ||
262 | * returns the total number of blocks to be allocated, including the | ||
263 | * direct and indirect blocks. | ||
264 | */ | ||
265 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | ||
266 | int blocks_to_boundary) | ||
267 | { | ||
268 | unsigned int count = 0; | ||
269 | |||
270 | /* | ||
271 | * Simple case: if the [t,d]indirect block(s) have not been allocated yet, | ||
272 | * then clearly the blocks on that path have not been allocated either | ||
273 | */ | ||
274 | if (k > 0) { | ||
275 | /* right now we don't handle cross boundary allocation */ | ||
276 | if (blks < blocks_to_boundary + 1) | ||
277 | count += blks; | ||
278 | else | ||
279 | count += blocks_to_boundary + 1; | ||
280 | return count; | ||
281 | } | ||
282 | |||
283 | count++; | ||
284 | while (count < blks && count <= blocks_to_boundary && | ||
285 | le32_to_cpu(*(branch[0].p + count)) == 0) { | ||
286 | count++; | ||
287 | } | ||
288 | return count; | ||
289 | } | ||
290 | |||
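The same counting rule with concrete numbers, as a hedged userspace sketch (leaf[0] corresponds to the slot of the requested block; not ext4 API):

#include <stdio.h>
#include <stdint.h>

static int blks_to_allocate(const uint32_t *leaf, int k,
			    unsigned int blks, int boundary)
{
	unsigned int count = 0;

	/* Missing indirect block(s): everything up to the boundary is free. */
	if (k > 0)
		return blks < (unsigned int)boundary + 1 ? blks : boundary + 1;

	/* Indirect block exists: count the run of still-zero slots. */
	count = 1;
	while (count < blks && count <= (unsigned int)boundary &&
	       leaf[count] == 0)
		count++;
	return count;
}

int main(void)
{
	uint32_t leaf[8] = {0, 0, 0, 555, 0, 0, 0, 0};

	/* Ask for 8 blocks with 7 to the boundary; slot 3 is already
	 * mapped, so only 3 can be allocated here. */
	printf("%d\n", blks_to_allocate(leaf, 0, 8, 7));	/* prints 3 */
	return 0;
}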
291 | /** | ||
292 | * ext4_alloc_blocks: allocate multiple blocks needed for a branch | ||
293 | * @handle: handle for this transaction | ||
294 | * @inode: inode which needs allocated blocks | ||
295 | * @iblock: the logical block to start allocating at | ||
296 | * @goal: preferred physical block of allocation | ||
297 | * @indirect_blks: the number of blocks that need to be allocated for indirect | ||
298 | * blocks | ||
299 | * @blks: number of desired blocks | ||
300 | * @new_blocks: on return it will store the new block numbers for | ||
301 | * the indirect blocks (if needed) and the first direct block, | ||
302 | * @err: on return it will store the error code | ||
303 | * | ||
304 | * This function will return the number of blocks allocated as | ||
305 | * requested by the passed-in parameters. | ||
306 | */ | ||
307 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
308 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
309 | int indirect_blks, int blks, | ||
310 | ext4_fsblk_t new_blocks[4], int *err) | ||
311 | { | ||
312 | struct ext4_allocation_request ar; | ||
313 | int target, i; | ||
314 | unsigned long count = 0, blk_allocated = 0; | ||
315 | int index = 0; | ||
316 | ext4_fsblk_t current_block = 0; | ||
317 | int ret = 0; | ||
318 | |||
319 | /* | ||
320 | * Here we try to allocate the requested multiple blocks at once, | ||
321 | * on a best-effort basis. | ||
322 | * To build a branch, we should allocate blocks for | ||
323 | * the indirect blocks(if not allocated yet), and at least | ||
324 | * the first direct block of this branch. That's the | ||
325 | * minimum number of blocks we need to allocate (required) | ||
326 | */ | ||
327 | /* first we try to allocate the indirect blocks */ | ||
328 | target = indirect_blks; | ||
329 | while (target > 0) { | ||
330 | count = target; | ||
331 | /* allocating blocks for indirect blocks and direct blocks */ | ||
332 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
333 | 0, &count, err); | ||
334 | if (*err) | ||
335 | goto failed_out; | ||
336 | |||
337 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
338 | EXT4_ERROR_INODE(inode, | ||
339 | "current_block %llu + count %lu > %d!", | ||
340 | current_block, count, | ||
341 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
342 | *err = -EIO; | ||
343 | goto failed_out; | ||
344 | } | ||
345 | |||
346 | target -= count; | ||
347 | /* allocate blocks for indirect blocks */ | ||
348 | while (index < indirect_blks && count) { | ||
349 | new_blocks[index++] = current_block++; | ||
350 | count--; | ||
351 | } | ||
352 | if (count > 0) { | ||
353 | /* | ||
354 | * save the new block number | ||
355 | * for the first direct block | ||
356 | */ | ||
357 | new_blocks[index] = current_block; | ||
358 | printk(KERN_INFO "%s returned more blocks than " | ||
359 | "requested\n", __func__); | ||
360 | WARN_ON(1); | ||
361 | break; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | target = blks - count; | ||
366 | blk_allocated = count; | ||
367 | if (!target) | ||
368 | goto allocated; | ||
369 | /* Now allocate data blocks */ | ||
370 | memset(&ar, 0, sizeof(ar)); | ||
371 | ar.inode = inode; | ||
372 | ar.goal = goal; | ||
373 | ar.len = target; | ||
374 | ar.logical = iblock; | ||
375 | if (S_ISREG(inode->i_mode)) | ||
376 | /* enable in-core preallocation only for regular files */ | ||
377 | ar.flags = EXT4_MB_HINT_DATA; | ||
378 | |||
379 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
380 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
381 | EXT4_ERROR_INODE(inode, | ||
382 | "current_block %llu + ar.len %d > %d!", | ||
383 | current_block, ar.len, | ||
384 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
385 | *err = -EIO; | ||
386 | goto failed_out; | ||
387 | } | ||
388 | |||
389 | if (*err && (target == blks)) { | ||
390 | /* | ||
391 | * if the allocation failed and we didn't allocate | ||
392 | * any blocks before | ||
393 | */ | ||
394 | goto failed_out; | ||
395 | } | ||
396 | if (!*err) { | ||
397 | if (target == blks) { | ||
398 | /* | ||
399 | * save the new block number | ||
400 | * for the first direct block | ||
401 | */ | ||
402 | new_blocks[index] = current_block; | ||
403 | } | ||
404 | blk_allocated += ar.len; | ||
405 | } | ||
406 | allocated: | ||
407 | /* total number of blocks allocated for direct blocks */ | ||
408 | ret = blk_allocated; | ||
409 | *err = 0; | ||
410 | return ret; | ||
411 | failed_out: | ||
412 | for (i = 0; i < index; i++) | ||
413 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
414 | return ret; | ||
415 | } | ||
416 | |||
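The shape of the first loop above, reduced to a userspace sketch: keep grabbing (possibly short) runs from the allocator until every indirect slot is covered. fake_alloc() stands in for ext4_new_meta_blocks() and everything here is purely illustrative.

#include <stdio.h>
#include <stdint.h>

static uint32_t next_free = 1000;

/* Stands in for ext4_new_meta_blocks(): may return a shorter run
 * than requested, starting wherever the "disk" has room. */
static uint32_t fake_alloc(unsigned long *count)
{
	uint32_t start = next_free;

	if (*count > 2)
		*count = 2;		/* pretend free space is fragmented */
	next_free += *count + 3;	/* next run is not adjacent */
	return start;
}

int main(void)
{
	uint32_t new_blocks[4];
	int indirect_blks = 3, index = 0, target = indirect_blks;
	unsigned long count;
	uint32_t cur;

	while (target > 0) {
		count = target;
		cur = fake_alloc(&count);
		target -= count;
		while (index < indirect_blks && count) {
			new_blocks[index++] = cur++;
			count--;
		}
		/* count > 0 here would mean the allocator overshot the
		 * request - the anomaly the WARN_ON() above guards. */
	}
	printf("indirect blocks: %u %u %u\n",
	       new_blocks[0], new_blocks[1], new_blocks[2]); /* 1000 1001 1005 */
	return 0;
}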
417 | /** | ||
418 | * ext4_alloc_branch - allocate and set up a chain of blocks. | ||
419 | * @handle: handle for this transaction | ||
420 | * @inode: owner | ||
421 | * @indirect_blks: number of allocated indirect blocks | ||
422 | * @blks: number of allocated direct blocks | ||
423 | * @goal: preferred place for allocation | ||
424 | * @offsets: offsets (in the blocks) to store the pointers to next. | ||
425 | * @branch: place to store the chain in. | ||
426 | * | ||
427 | * This function allocates blocks, zeroes out all but the last one, | ||
428 | * links them into chain and (if we are synchronous) writes them to disk. | ||
429 | * In other words, it prepares a branch that can be spliced onto the | ||
430 | * inode. It stores the information about that chain in the branch[], in | ||
431 | * the same format as ext4_get_branch() would do. We are calling it after | ||
432 | * we had read the existing part of chain and partial points to the last | ||
433 | * triple of that (one with zero ->key). Upon the exit we have the same | ||
434 | * picture as after the successful ext4_get_block(), except that in one | ||
435 | * place chain is disconnected - *branch->p is still zero (we did not | ||
436 | * set the last link), but branch->key contains the number that should | ||
437 | * be placed into *branch->p to fill that gap. | ||
438 | * | ||
439 | * If allocation fails we free all blocks we've allocated (and forget | ||
440 | * their buffer_heads) and return the error value from the failed | ||
441 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | ||
442 | * as described above and return 0. | ||
443 | */ | ||
444 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | ||
445 | ext4_lblk_t iblock, int indirect_blks, | ||
446 | int *blks, ext4_fsblk_t goal, | ||
447 | ext4_lblk_t *offsets, Indirect *branch) | ||
448 | { | ||
449 | int blocksize = inode->i_sb->s_blocksize; | ||
450 | int i, n = 0; | ||
451 | int err = 0; | ||
452 | struct buffer_head *bh; | ||
453 | int num; | ||
454 | ext4_fsblk_t new_blocks[4]; | ||
455 | ext4_fsblk_t current_block; | ||
456 | |||
457 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
458 | *blks, new_blocks, &err); | ||
459 | if (err) | ||
460 | return err; | ||
461 | |||
462 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
463 | /* | ||
464 | * metadata blocks and data blocks are allocated. | ||
465 | */ | ||
466 | for (n = 1; n <= indirect_blks; n++) { | ||
467 | /* | ||
468 | * Get buffer_head for parent block, zero it out | ||
469 | * and set the pointer to new one, then send | ||
470 | * parent to disk. | ||
471 | */ | ||
472 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | ||
473 | if (unlikely(!bh)) { | ||
474 | err = -EIO; | ||
475 | goto failed; | ||
476 | } | ||
477 | |||
478 | branch[n].bh = bh; | ||
479 | lock_buffer(bh); | ||
480 | BUFFER_TRACE(bh, "call get_create_access"); | ||
481 | err = ext4_journal_get_create_access(handle, bh); | ||
482 | if (err) { | ||
483 | /* Don't brelse(bh) here; it's done in | ||
484 | * ext4_journal_forget() below */ | ||
485 | unlock_buffer(bh); | ||
486 | goto failed; | ||
487 | } | ||
488 | |||
489 | memset(bh->b_data, 0, blocksize); | ||
490 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | ||
491 | branch[n].key = cpu_to_le32(new_blocks[n]); | ||
492 | *branch[n].p = branch[n].key; | ||
493 | if (n == indirect_blks) { | ||
494 | current_block = new_blocks[n]; | ||
495 | /* | ||
496 | * End of chain, update the last new metablock of | ||
497 | * the chain to point to the new allocated | ||
498 | * data blocks numbers | ||
499 | */ | ||
500 | for (i = 1; i < num; i++) | ||
501 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
502 | } | ||
503 | BUFFER_TRACE(bh, "marking uptodate"); | ||
504 | set_buffer_uptodate(bh); | ||
505 | unlock_buffer(bh); | ||
506 | |||
507 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
508 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
509 | if (err) | ||
510 | goto failed; | ||
511 | } | ||
512 | *blks = num; | ||
513 | return err; | ||
514 | failed: | ||
515 | /* Allocation failed, free what we already allocated */ | ||
516 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | ||
517 | for (i = 1; i <= n ; i++) { | ||
518 | /* | ||
519 | * branch[i].bh is newly allocated, so there is no | ||
520 | * need to revoke the block, which is why we don't | ||
521 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
522 | */ | ||
523 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
524 | EXT4_FREE_BLOCKS_FORGET); | ||
525 | } | ||
526 | for (i = n+1; i < indirect_blks; i++) | ||
527 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
528 | |||
529 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
530 | |||
531 | return err; | ||
532 | } | ||
533 | |||
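How the wiring step comes together, as a self-contained sketch (array indices stand in for block numbers and buffer_heads; illustrative only): each new indirect block is zeroed, its offsets[n] slot points at the next new block, and the deepest one receives the run of consecutive data block numbers. The inode slot itself is deliberately left unset, which is the "missing link" that ext4_splice_branch() fills.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define SLOTS 4

int main(void)
{
	uint32_t blk[2][SLOTS];			/* two new indirect "blocks" */
	uint32_t new_blocks[3] = {200, 201, 300};	/* 2 indirect + 1st data */
	int offsets[3] = {5, 0, 1};		/* offsets[0] indexes the inode */
	int indirect_blks = 2, num = 3;		/* num direct blocks in all */
	int i, n;

	for (n = 1; n <= indirect_blks; n++) {
		memset(blk[n - 1], 0, sizeof(blk[n - 1]));	/* zero it out */
		blk[n - 1][offsets[n]] = new_blocks[n];		/* link level n */
		if (n == indirect_blks)		/* leaf: fill the data run */
			for (i = 1; i < num; i++)
				blk[n - 1][offsets[n] + i] = new_blocks[n] + i;
	}
	/* blk[0] = {201,0,0,0}, blk[1] = {0,300,301,302}; the inode's
	 * slot offsets[0] is still unset. */
	printf("%u %u %u %u\n", blk[1][0], blk[1][1], blk[1][2], blk[1][3]);
	return 0;
}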
534 | /** | ||
535 | * ext4_splice_branch - splice the allocated branch onto inode. | ||
536 | * @handle: handle for this transaction | ||
537 | * @inode: owner | ||
538 | * @block: (logical) number of block we are adding | ||
539 | * @chain: chain of indirect blocks (with a missing link - see | ||
540 | * ext4_alloc_branch) | ||
541 | * @where: location of missing link | ||
542 | * @num: number of indirect blocks we are adding | ||
543 | * @blks: number of direct blocks we are adding | ||
544 | * | ||
545 | * This function fills the missing link and does all housekeeping needed in | ||
546 | * inode (->i_blocks, etc.). In case of success we end up with the full | ||
547 | * chain to new block and return 0. | ||
548 | */ | ||
549 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | ||
550 | ext4_lblk_t block, Indirect *where, int num, | ||
551 | int blks) | ||
552 | { | ||
553 | int i; | ||
554 | int err = 0; | ||
555 | ext4_fsblk_t current_block; | ||
556 | |||
557 | /* | ||
558 | * If we're splicing into a [td]indirect block (as opposed to the | ||
559 | * inode) then we need to get write access to the [td]indirect block | ||
560 | * before the splice. | ||
561 | */ | ||
562 | if (where->bh) { | ||
563 | BUFFER_TRACE(where->bh, "get_write_access"); | ||
564 | err = ext4_journal_get_write_access(handle, where->bh); | ||
565 | if (err) | ||
566 | goto err_out; | ||
567 | } | ||
568 | /* That's it */ | ||
569 | |||
570 | *where->p = where->key; | ||
571 | |||
572 | /* | ||
573 | * Update the host buffer_head or inode to point to the just-allocated | ||
574 | * direct blocks | ||
575 | */ | ||
576 | if (num == 0 && blks > 1) { | ||
577 | current_block = le32_to_cpu(where->key) + 1; | ||
578 | for (i = 1; i < blks; i++) | ||
579 | *(where->p + i) = cpu_to_le32(current_block++); | ||
580 | } | ||
581 | |||
582 | /* We are done with atomic stuff, now do the rest of housekeeping */ | ||
583 | /* had we spliced it onto indirect block? */ | ||
584 | if (where->bh) { | ||
585 | /* | ||
586 | * If we spliced it onto an indirect block, we haven't | ||
587 | * altered the inode. Note however that if it is being spliced | ||
588 | * onto an indirect block at the very end of the file (the | ||
589 | * file is growing) then we *will* alter the inode to reflect | ||
590 | * the new i_size. But that is not done here - it is done in | ||
591 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | ||
592 | */ | ||
593 | jbd_debug(5, "splicing indirect only\n"); | ||
594 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | ||
595 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | ||
596 | if (err) | ||
597 | goto err_out; | ||
598 | } else { | ||
599 | /* | ||
600 | * OK, we spliced it into the inode itself on a direct block. | ||
601 | */ | ||
602 | ext4_mark_inode_dirty(handle, inode); | ||
603 | jbd_debug(5, "splicing direct\n"); | ||
604 | } | ||
605 | return err; | ||
606 | |||
607 | err_out: | ||
608 | for (i = 1; i <= num; i++) { | ||
609 | /* | ||
610 | * branch[i].bh is newly allocated, so there is no | ||
611 | * need to revoke the block, which is why we don't | ||
612 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
613 | */ | ||
614 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | ||
615 | EXT4_FREE_BLOCKS_FORGET); | ||
616 | } | ||
617 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | ||
618 | blks, 0); | ||
619 | |||
620 | return err; | ||
621 | } | ||
622 | |||
623 | /* | ||
624 | * The ext4_ind_map_blocks() function handles non-extents inodes | ||
625 | * (i.e., using the traditional indirect/double-indirect i_blocks | ||
626 | * scheme) for ext4_map_blocks(). | ||
627 | * | ||
628 | * Allocation strategy is simple: if we have to allocate something, we will | ||
629 | * have to go the whole way to leaf. So let's do it before attaching anything | ||
630 | * to tree, set linkage between the newborn blocks, write them if sync is | ||
631 | * required, recheck the path, free and repeat if check fails, otherwise | ||
632 | * set the last missing link (that will protect us from any truncate-generated | ||
633 | * removals - all blocks on the path are immune now) and possibly force the | ||
634 | * write on the parent block. | ||
635 | * That has a nice additional property: no special recovery from the failed | ||
636 | * allocations is needed - we simply release blocks and do not touch anything | ||
637 | * reachable from inode. | ||
638 | * | ||
639 | * `handle' can be NULL if create == 0. | ||
640 | * | ||
641 | * return > 0, # of blocks mapped or allocated. | ||
642 | * return = 0, if plain lookup failed. | ||
643 | * return < 0, error case. | ||
644 | * | ||
645 | * The ext4_ind_map_blocks() function should be called with | ||
646 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | ||
647 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | ||
648 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | ||
649 | * blocks. | ||
650 | */ | ||
651 | int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | ||
652 | struct ext4_map_blocks *map, | ||
653 | int flags) | ||
654 | { | ||
655 | int err = -EIO; | ||
656 | ext4_lblk_t offsets[4]; | ||
657 | Indirect chain[4]; | ||
658 | Indirect *partial; | ||
659 | ext4_fsblk_t goal; | ||
660 | int indirect_blks; | ||
661 | int blocks_to_boundary = 0; | ||
662 | int depth; | ||
663 | int count = 0; | ||
664 | ext4_fsblk_t first_block = 0; | ||
665 | |||
666 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | ||
667 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | ||
668 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | ||
669 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | ||
670 | &blocks_to_boundary); | ||
671 | |||
672 | if (depth == 0) | ||
673 | goto out; | ||
674 | |||
675 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | ||
676 | |||
677 | /* Simplest case - block found, no allocation needed */ | ||
678 | if (!partial) { | ||
679 | first_block = le32_to_cpu(chain[depth - 1].key); | ||
680 | count++; | ||
681 | /* map more blocks */ | ||
682 | while (count < map->m_len && count <= blocks_to_boundary) { | ||
683 | ext4_fsblk_t blk; | ||
684 | |||
685 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | ||
686 | |||
687 | if (blk == first_block + count) | ||
688 | count++; | ||
689 | else | ||
690 | break; | ||
691 | } | ||
692 | goto got_it; | ||
693 | } | ||
694 | |||
695 | /* Next simple case - plain lookup or failed read of indirect block */ | ||
696 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | ||
697 | goto cleanup; | ||
698 | |||
699 | /* | ||
700 | * Okay, we need to do block allocation. | ||
701 | */ | ||
702 | goal = ext4_find_goal(inode, map->m_lblk, partial); | ||
703 | |||
704 | /* the number of blocks need to allocate for [d,t]indirect blocks */ | ||
705 | indirect_blks = (chain + depth) - partial - 1; | ||
706 | |||
707 | /* | ||
708 | * Next look up the indirect map to count the total number of | ||
709 | * direct blocks to allocate for this branch. | ||
710 | */ | ||
711 | count = ext4_blks_to_allocate(partial, indirect_blks, | ||
712 | map->m_len, blocks_to_boundary); | ||
713 | /* | ||
714 | * Block out ext4_truncate while we alter the tree | ||
715 | */ | ||
716 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | ||
717 | &count, goal, | ||
718 | offsets + (partial - chain), partial); | ||
719 | |||
720 | /* | ||
721 | * The ext4_splice_branch call will free and forget any buffers | ||
722 | * on the new chain if there is a failure, but that risks using | ||
723 | * up transaction credits, especially for bitmaps where the | ||
724 | * credits cannot be returned. Can we handle this somehow? We | ||
725 | * may need to return -EAGAIN upwards in the worst case. --sct | ||
726 | */ | ||
727 | if (!err) | ||
728 | err = ext4_splice_branch(handle, inode, map->m_lblk, | ||
729 | partial, indirect_blks, count); | ||
730 | if (err) | ||
731 | goto cleanup; | ||
732 | |||
733 | map->m_flags |= EXT4_MAP_NEW; | ||
734 | |||
735 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
736 | got_it: | ||
737 | map->m_flags |= EXT4_MAP_MAPPED; | ||
738 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | ||
739 | map->m_len = count; | ||
740 | if (count > blocks_to_boundary) | ||
741 | map->m_flags |= EXT4_MAP_BOUNDARY; | ||
742 | err = count; | ||
743 | /* Clean up and exit */ | ||
744 | partial = chain + depth - 1; /* the whole chain */ | ||
745 | cleanup: | ||
746 | while (partial > chain) { | ||
747 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
748 | brelse(partial->bh); | ||
749 | partial--; | ||
750 | } | ||
751 | out: | ||
752 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, | ||
753 | map->m_pblk, map->m_len, err); | ||
754 | return err; | ||
755 | } | ||
756 | |||
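The "simplest case" loop above is worth seeing in isolation: starting at the slot of the requested block, the mapping is extended while the on-disk numbers stay consecutive and we stay inside the leaf. A hedged sketch, not ext4 API:

#include <stdio.h>
#include <stdint.h>

static unsigned int count_contiguous(const uint32_t *slot, unsigned int want,
				     unsigned int blocks_to_boundary)
{
	uint32_t first = slot[0];
	unsigned int count = 1;

	while (count < want && count <= blocks_to_boundary &&
	       slot[count] == first + count)
		count++;
	return count;
}

int main(void)
{
	uint32_t leaf[6] = {500, 501, 502, 900, 0, 0};

	/* 500..502 are consecutive; 900 breaks the run. */
	printf("%u\n", count_contiguous(leaf, 6, 5));	/* prints 3 */
	return 0;
}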
757 | /* | ||
758 | * O_DIRECT for ext3 (or indirect map) based files | ||
759 | * | ||
760 | * If the O_DIRECT write will extend the file then add this inode to the | ||
761 | * orphan list. So recovery will truncate it back to the original size | ||
762 | * if the machine crashes during the write. | ||
763 | * | ||
764 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | ||
765 | * crashes then stale disk data _may_ be exposed inside the file. But current | ||
766 | * VFS code falls back into buffered path in that case so we are safe. | ||
767 | */ | ||
768 | ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
769 | const struct iovec *iov, loff_t offset, | ||
770 | unsigned long nr_segs) | ||
771 | { | ||
772 | struct file *file = iocb->ki_filp; | ||
773 | struct inode *inode = file->f_mapping->host; | ||
774 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
775 | handle_t *handle; | ||
776 | ssize_t ret; | ||
777 | int orphan = 0; | ||
778 | size_t count = iov_length(iov, nr_segs); | ||
779 | int retries = 0; | ||
780 | |||
781 | if (rw == WRITE) { | ||
782 | loff_t final_size = offset + count; | ||
783 | |||
784 | if (final_size > inode->i_size) { | ||
785 | /* Credits for sb + inode write */ | ||
786 | handle = ext4_journal_start(inode, 2); | ||
787 | if (IS_ERR(handle)) { | ||
788 | ret = PTR_ERR(handle); | ||
789 | goto out; | ||
790 | } | ||
791 | ret = ext4_orphan_add(handle, inode); | ||
792 | if (ret) { | ||
793 | ext4_journal_stop(handle); | ||
794 | goto out; | ||
795 | } | ||
796 | orphan = 1; | ||
797 | ei->i_disksize = inode->i_size; | ||
798 | ext4_journal_stop(handle); | ||
799 | } | ||
800 | } | ||
801 | |||
802 | retry: | ||
803 | if (rw == READ && ext4_should_dioread_nolock(inode)) { | ||
804 | if (unlikely(!list_empty(&ei->i_completed_io_list))) { | ||
805 | mutex_lock(&inode->i_mutex); | ||
806 | ext4_flush_completed_IO(inode); | ||
807 | mutex_unlock(&inode->i_mutex); | ||
808 | } | ||
809 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
810 | inode->i_sb->s_bdev, iov, | ||
811 | offset, nr_segs, | ||
812 | ext4_get_block, NULL, NULL, 0); | ||
813 | } else { | ||
814 | ret = blockdev_direct_IO(rw, iocb, inode, iov, | ||
815 | offset, nr_segs, ext4_get_block); | ||
816 | |||
817 | if (unlikely((rw & WRITE) && ret < 0)) { | ||
818 | loff_t isize = i_size_read(inode); | ||
819 | loff_t end = offset + iov_length(iov, nr_segs); | ||
820 | |||
821 | if (end > isize) | ||
822 | ext4_truncate_failed_write(inode); | ||
823 | } | ||
824 | } | ||
825 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
826 | goto retry; | ||
827 | |||
828 | if (orphan) { | ||
829 | int err; | ||
830 | |||
831 | /* Credits for sb + inode write */ | ||
832 | handle = ext4_journal_start(inode, 2); | ||
833 | if (IS_ERR(handle)) { | ||
834 | /* This is really bad luck. We've written the data | ||
835 | * but cannot extend i_size. Bail out and pretend | ||
836 | * the write failed... */ | ||
837 | ret = PTR_ERR(handle); | ||
838 | if (inode->i_nlink) | ||
839 | ext4_orphan_del(NULL, inode); | ||
840 | |||
841 | goto out; | ||
842 | } | ||
843 | if (inode->i_nlink) | ||
844 | ext4_orphan_del(handle, inode); | ||
845 | if (ret > 0) { | ||
846 | loff_t end = offset + ret; | ||
847 | if (end > inode->i_size) { | ||
848 | ei->i_disksize = end; | ||
849 | i_size_write(inode, end); | ||
850 | /* | ||
851 | * We're going to return a positive `ret' | ||
852 | * here due to non-zero-length I/O, so there's | ||
853 | * no way of reporting error returns from | ||
854 | * ext4_mark_inode_dirty() to userspace. So | ||
855 | * ignore it. | ||
856 | */ | ||
857 | ext4_mark_inode_dirty(handle, inode); | ||
858 | } | ||
859 | } | ||
860 | err = ext4_journal_stop(handle); | ||
861 | if (ret == 0) | ||
862 | ret = err; | ||
863 | } | ||
864 | out: | ||
865 | return ret; | ||
866 | } | ||
867 | |||
868 | /* | ||
869 | * Calculate the number of metadata blocks we need to reserve | ||
870 | * to allocate a new block at @lblock for a non-extent-based file | ||
871 | */ | ||
872 | int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) | ||
873 | { | ||
874 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
875 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); | ||
876 | int blk_bits; | ||
877 | |||
878 | if (lblock < EXT4_NDIR_BLOCKS) | ||
879 | return 0; | ||
880 | |||
881 | lblock -= EXT4_NDIR_BLOCKS; | ||
882 | |||
883 | if (ei->i_da_metadata_calc_len && | ||
884 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | ||
885 | ei->i_da_metadata_calc_len++; | ||
886 | return 0; | ||
887 | } | ||
888 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | ||
889 | ei->i_da_metadata_calc_len = 1; | ||
890 | blk_bits = order_base_2(lblock); | ||
891 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | ||
892 | } | ||
893 | |||
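A worked example of the formula, assuming 4KB blocks (1024 pointers per block, so ADDR_PER_BLOCK_BITS = 10) and 12 direct slots; the caching via i_da_metadata_calc_len, which makes repeat calls within the same dindirect cluster return 0, is omitted here:

#include <stdio.h>

#define NDIR_BLOCKS 12
#define ADDR_BITS   10

static int order_base_2(unsigned long n)	/* ceil(log2(n)) */
{
	int bits = 0;

	while ((1UL << bits) < n)
		bits++;
	return bits;
}

static int calc_metadata_amount(unsigned long lblock)
{
	if (lblock < NDIR_BLOCKS)
		return 0;			/* direct: no metadata */
	lblock -= NDIR_BLOCKS;
	return order_base_2(lblock) / ADDR_BITS + 1;
}

int main(void)
{
	printf("%d\n", calc_metadata_amount(5));	/* 0: direct block */
	printf("%d\n", calc_metadata_amount(500));	/* 1: one indirect */
	printf("%d\n", calc_metadata_amount(2000000));	/* 3: ind+dind+tind */
	return 0;
}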
894 | int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) | ||
895 | { | ||
896 | int indirects; | ||
897 | |||
898 | /* if nrblocks are contiguous */ | ||
899 | if (chunk) { | ||
900 | /* | ||
901 | * With N contiguous data blocks, we need at most | ||
902 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
903 | * 2 dindirect blocks, and 1 tindirect block | ||
904 | */ | ||
905 | return DIV_ROUND_UP(nrblocks, | ||
906 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
907 | } | ||
908 | /* | ||
909 | * if nrblocks are not contiguous, worst case, each block touches | ||
910 | * an indirect block, and each indirect block touches a double indirect | ||
911 | * block, plus a triple indirect block | ||
912 | */ | ||
913 | indirects = nrblocks * 2 + 1; | ||
914 | return indirects; | ||
915 | } | ||
916 | |||
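The two credit estimates with concrete numbers, again assuming 1024 block pointers per block (an illustrative sketch, not ext4 API):

#include <stdio.h>

#define ADDR_PER_BLOCK 1024

static int ind_trans_blocks(int nrblocks, int chunk)
{
	if (chunk)	/* contiguous: ceil(N/1024) indirects + 4 for
			 * the extra indirect/dind/tind overhead */
		return (nrblocks + ADDR_PER_BLOCK - 1) / ADDR_PER_BLOCK + 4;
	/* scattered: each block may touch its own indirect and
	 * dindirect, plus one shared tindirect */
	return nrblocks * 2 + 1;
}

int main(void)
{
	printf("%d\n", ind_trans_blocks(2048, 1));	/* 6 = 2048/1024 + 4 */
	printf("%d\n", ind_trans_blocks(4, 0));		/* 9 = 4*2 + 1 */
	return 0;
}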
917 | /* | ||
918 | * Truncate transactions can be complex and absolutely huge. So we need to | ||
919 | * be able to restart the transaction at a convenient checkpoint to make | ||
920 | * sure we don't overflow the journal. | ||
921 | * | ||
922 | * start_transaction gets us a new handle for a truncate transaction, | ||
923 | * and extend_transaction tries to extend the existing one a bit. If | ||
924 | * extend fails, we need to propagate the failure up and restart the | ||
925 | * transaction in the top-level truncate loop. --sct | ||
926 | */ | ||
927 | static handle_t *start_transaction(struct inode *inode) | ||
928 | { | ||
929 | handle_t *result; | ||
930 | |||
931 | result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); | ||
932 | if (!IS_ERR(result)) | ||
933 | return result; | ||
934 | |||
935 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
936 | return result; | ||
937 | } | ||
938 | |||
939 | /* | ||
940 | * Try to extend this transaction for the purposes of truncation. | ||
941 | * | ||
942 | * Returns 0 if we managed to create more room. If we can't create more | ||
943 | * room, and the transaction must be restarted we return 1. | ||
944 | */ | ||
945 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | ||
946 | { | ||
947 | if (!ext4_handle_valid(handle)) | ||
948 | return 0; | ||
949 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | ||
950 | return 0; | ||
951 | if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) | ||
952 | return 0; | ||
953 | return 1; | ||
954 | } | ||
955 | |||
956 | /* | ||
957 | * Probably it should be a library function... search for first non-zero word | ||
958 | * or memcmp with zero_page, whatever is better for particular architecture. | ||
959 | * Linus? | ||
960 | */ | ||
961 | static inline int all_zeroes(__le32 *p, __le32 *q) | ||
962 | { | ||
963 | while (p < q) | ||
964 | if (*p++) | ||
965 | return 0; | ||
966 | return 1; | ||
967 | } | ||
968 | |||
969 | /** | ||
970 | * ext4_find_shared - find the indirect blocks for partial truncation. | ||
971 | * @inode: inode in question | ||
972 | * @depth: depth of the affected branch | ||
973 | * @offsets: offsets of pointers in that branch (see ext4_block_to_path) | ||
974 | * @chain: place to store the pointers to partial indirect blocks | ||
975 | * @top: place to the (detached) top of branch | ||
976 | * | ||
977 | * This is a helper function used by ext4_truncate(). | ||
978 | * | ||
979 | * When we do truncate() we may have to clean the ends of several | ||
980 | * indirect blocks but leave the blocks themselves alive. Block is | ||
981 | * partially truncated if some data below the new i_size is referenced | ||
982 | * from it (and it is on the path to the first completely truncated | ||
983 | * data block, indeed). We have to free the top of that path along | ||
984 | * with everything to the right of the path. Since no allocation | ||
985 | * past the truncation point is possible until ext4_truncate() | ||
986 | * finishes, we may safely do the latter, but the top of the branch may | ||
987 | * require special attention - pageout below the truncation point | ||
988 | * might try to populate it. | ||
989 | * | ||
990 | * We atomically detach the top of branch from the tree, store the | ||
991 | * block number of its root in *@top, pointers to buffer_heads of | ||
992 | * partially truncated blocks - in @chain[].bh and pointers to | ||
993 | * their last elements that should not be removed - in | ||
994 | * @chain[].p. Return value is the pointer to last filled element | ||
995 | * of @chain. | ||
996 | * | ||
997 | * The caller is left to do the actual freeing of subtrees: | ||
998 | * a) free the subtree starting from *@top | ||
999 | * b) free the subtrees whose roots are stored in | ||
1000 | * (@chain[i].p+1 .. end of @chain[i].bh->b_data) | ||
1001 | * c) free the subtrees growing from the inode past the @chain[0]. | ||
1002 | * (no partially truncated stuff there). */ | ||
1003 | |||
1004 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | ||
1005 | ext4_lblk_t offsets[4], Indirect chain[4], | ||
1006 | __le32 *top) | ||
1007 | { | ||
1008 | Indirect *partial, *p; | ||
1009 | int k, err; | ||
1010 | |||
1011 | *top = 0; | ||
1012 | /* Make k index the deepest non-null offset + 1 */ | ||
1013 | for (k = depth; k > 1 && !offsets[k-1]; k--) | ||
1014 | ; | ||
1015 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | ||
1016 | /* Writer: pointers */ | ||
1017 | if (!partial) | ||
1018 | partial = chain + k-1; | ||
1019 | /* | ||
1020 | * If the branch acquired continuation since we've looked at it - | ||
1021 | * fine, it should all survive and (new) top doesn't belong to us. | ||
1022 | */ | ||
1023 | if (!partial->key && *partial->p) | ||
1024 | /* Writer: end */ | ||
1025 | goto no_top; | ||
1026 | for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) | ||
1027 | ; | ||
1028 | /* | ||
1029 | * OK, we've found the last block that must survive. The rest of our | ||
1030 | * branch should be detached before unlocking. However, if the rest | ||
1031 | * of the branch is all ours and does not grow immediately from the inode | ||
1032 | * it's easier to cheat and just decrement partial->p. | ||
1033 | */ | ||
1034 | if (p == chain + k - 1 && p > chain) { | ||
1035 | p->p--; | ||
1036 | } else { | ||
1037 | *top = *p->p; | ||
1038 | /* Nope, don't do this in ext4. Must leave the tree intact */ | ||
1039 | #if 0 | ||
1040 | *p->p = 0; | ||
1041 | #endif | ||
1042 | } | ||
1043 | /* Writer: end */ | ||
1044 | |||
1045 | while (partial > p) { | ||
1046 | brelse(partial->bh); | ||
1047 | partial--; | ||
1048 | } | ||
1049 | no_top: | ||
1050 | return partial; | ||
1051 | } | ||
1052 | |||
1053 | /* | ||
1054 | * Zero a number of block pointers in either an inode or an indirect block. | ||
1055 | * If we restart the transaction we must again get write access to the | ||
1056 | * indirect block for further modification. | ||
1057 | * | ||
1058 | * We release `count' blocks on disk, but (last - first) may be greater | ||
1059 | * than `count' because there can be holes in there. | ||
1060 | * | ||
1061 | * Return 0 on success, 1 on invalid block range | ||
1062 | * and < 0 on fatal error. | ||
1063 | */ | ||
1064 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
1065 | struct buffer_head *bh, | ||
1066 | ext4_fsblk_t block_to_free, | ||
1067 | unsigned long count, __le32 *first, | ||
1068 | __le32 *last) | ||
1069 | { | ||
1070 | __le32 *p; | ||
1071 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
1072 | int err; | ||
1073 | |||
1074 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
1075 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
1076 | |||
1077 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
1078 | count)) { | ||
1079 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
1080 | "blocks %llu len %lu", | ||
1081 | (unsigned long long) block_to_free, count); | ||
1082 | return 1; | ||
1083 | } | ||
1084 | |||
1085 | if (try_to_extend_transaction(handle, inode)) { | ||
1086 | if (bh) { | ||
1087 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
1088 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
1089 | if (unlikely(err)) | ||
1090 | goto out_err; | ||
1091 | } | ||
1092 | err = ext4_mark_inode_dirty(handle, inode); | ||
1093 | if (unlikely(err)) | ||
1094 | goto out_err; | ||
1095 | err = ext4_truncate_restart_trans(handle, inode, | ||
1096 | ext4_blocks_for_truncate(inode)); | ||
1097 | if (unlikely(err)) | ||
1098 | goto out_err; | ||
1099 | if (bh) { | ||
1100 | BUFFER_TRACE(bh, "retaking write access"); | ||
1101 | err = ext4_journal_get_write_access(handle, bh); | ||
1102 | if (unlikely(err)) | ||
1103 | goto out_err; | ||
1104 | } | ||
1105 | } | ||
1106 | |||
1107 | for (p = first; p < last; p++) | ||
1108 | *p = 0; | ||
1109 | |||
1110 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); | ||
1111 | return 0; | ||
1112 | out_err: | ||
1113 | ext4_std_error(inode->i_sb, err); | ||
1114 | return err; | ||
1115 | } | ||
1116 | |||
1117 | /** | ||
1118 | * ext4_free_data - free a list of data blocks | ||
1119 | * @handle: handle for this transaction | ||
1120 | * @inode: inode we are dealing with | ||
1121 | * @this_bh: indirect buffer_head which contains *@first and *@last | ||
1122 | * @first: array of block numbers | ||
1123 | * @last: points immediately past the end of array | ||
1124 | * | ||
1125 | * We are freeing all blocks referred from that array (numbers are stored as | ||
1126 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | ||
1127 | * | ||
1128 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | ||
1129 | * blocks are contiguous then releasing them at one time will only affect one | ||
1130 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | ||
1131 | * actually use a lot of journal space. | ||
1132 | * | ||
1133 | * @this_bh will be %NULL if @first and @last point into the inode's direct | ||
1134 | * block pointers. | ||
1135 | */ | ||
1136 | static void ext4_free_data(handle_t *handle, struct inode *inode, | ||
1137 | struct buffer_head *this_bh, | ||
1138 | __le32 *first, __le32 *last) | ||
1139 | { | ||
1140 | ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ | ||
1141 | unsigned long count = 0; /* Number of blocks in the run */ | ||
1142 | __le32 *block_to_free_p = NULL; /* Pointer into inode/ind | ||
1143 | corresponding to | ||
1144 | block_to_free */ | ||
1145 | ext4_fsblk_t nr; /* Current block # */ | ||
1146 | __le32 *p; /* Pointer into inode/ind | ||
1147 | for current block */ | ||
1148 | int err = 0; | ||
1149 | |||
1150 | if (this_bh) { /* For indirect block */ | ||
1151 | BUFFER_TRACE(this_bh, "get_write_access"); | ||
1152 | err = ext4_journal_get_write_access(handle, this_bh); | ||
1153 | /* Important: if we can't update the indirect pointers | ||
1154 | * to the blocks, we can't free them. */ | ||
1155 | if (err) | ||
1156 | return; | ||
1157 | } | ||
1158 | |||
1159 | for (p = first; p < last; p++) { | ||
1160 | nr = le32_to_cpu(*p); | ||
1161 | if (nr) { | ||
1162 | /* accumulate blocks to free if they're contiguous */ | ||
1163 | if (count == 0) { | ||
1164 | block_to_free = nr; | ||
1165 | block_to_free_p = p; | ||
1166 | count = 1; | ||
1167 | } else if (nr == block_to_free + count) { | ||
1168 | count++; | ||
1169 | } else { | ||
1170 | err = ext4_clear_blocks(handle, inode, this_bh, | ||
1171 | block_to_free, count, | ||
1172 | block_to_free_p, p); | ||
1173 | if (err) | ||
1174 | break; | ||
1175 | block_to_free = nr; | ||
1176 | block_to_free_p = p; | ||
1177 | count = 1; | ||
1178 | } | ||
1179 | } | ||
1180 | } | ||
1181 | |||
1182 | if (!err && count > 0) | ||
1183 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, | ||
1184 | count, block_to_free_p, p); | ||
1185 | if (err < 0) | ||
1186 | /* fatal error */ | ||
1187 | return; | ||
1188 | |||
1189 | if (this_bh) { | ||
1190 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | ||
1191 | |||
1192 | /* | ||
1193 | * The buffer head should have an attached journal head at this | ||
1194 | * point. However, if the data is corrupted and an indirect | ||
1195 | * block pointed to itself, it would have been detached when | ||
1196 | * the block was cleared. Check for this instead of OOPSing. | ||
1197 | */ | ||
1198 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | ||
1199 | ext4_handle_dirty_metadata(handle, inode, this_bh); | ||
1200 | else | ||
1201 | EXT4_ERROR_INODE(inode, | ||
1202 | "circular indirect block detected at " | ||
1203 | "block %llu", | ||
1204 | (unsigned long long) this_bh->b_blocknr); | ||
1205 | } | ||
1206 | } | ||
1207 | |||
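The run-coalescing at the heart of ext4_free_data(), reduced to a userspace sketch: batch consecutive block numbers so each free call covers one extent and touches few bitmap blocks. free_run() stands in for ext4_free_blocks(); all names are illustrative.

#include <stdio.h>
#include <stdint.h>

static void free_run(uint32_t start, unsigned long count)
{
	printf("free %lu block(s) starting at %u\n", count, start);
}

static void free_data(const uint32_t *first, const uint32_t *last)
{
	uint32_t block_to_free = 0;
	unsigned long count = 0;
	const uint32_t *p;

	for (p = first; p < last; p++) {
		if (!*p)			/* a hole: skip it */
			continue;
		if (count && *p == block_to_free + count) {
			count++;		/* extends the current run */
		} else {
			if (count)
				free_run(block_to_free, count);
			block_to_free = *p;	/* start a new run */
			count = 1;
		}
	}
	if (count)
		free_run(block_to_free, count);
}

int main(void)
{
	uint32_t blocks[7] = {100, 101, 102, 0, 200, 201, 50};

	free_data(blocks, blocks + 7);
	/* prints: free 3 at 100 / free 2 at 200 / free 1 at 50 */
	return 0;
}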
1208 | /** | ||
1209 | * ext4_free_branches - free an array of branches | ||
1210 | * @handle: JBD handle for this transaction | ||
1211 | * @inode: inode we are dealing with | ||
1212 | * @parent_bh: the buffer_head which contains *@first and *@last | ||
1213 | * @first: array of block numbers | ||
1214 | * @last: pointer immediately past the end of array | ||
1215 | * @depth: depth of the branches to free | ||
1216 | * | ||
1217 | * We are freeing all blocks referred from these branches (numbers are | ||
1218 | * stored as little-endian 32-bit) and updating @inode->i_blocks | ||
1219 | * appropriately. | ||
1220 | */ | ||
1221 | static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
1222 | struct buffer_head *parent_bh, | ||
1223 | __le32 *first, __le32 *last, int depth) | ||
1224 | { | ||
1225 | ext4_fsblk_t nr; | ||
1226 | __le32 *p; | ||
1227 | |||
1228 | if (ext4_handle_is_aborted(handle)) | ||
1229 | return; | ||
1230 | |||
1231 | if (depth--) { | ||
1232 | struct buffer_head *bh; | ||
1233 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
1234 | p = last; | ||
1235 | while (--p >= first) { | ||
1236 | nr = le32_to_cpu(*p); | ||
1237 | if (!nr) | ||
1238 | continue; /* A hole */ | ||
1239 | |||
1240 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
1241 | nr, 1)) { | ||
1242 | EXT4_ERROR_INODE(inode, | ||
1243 | "invalid indirect mapped " | ||
1244 | "block %lu (level %d)", | ||
1245 | (unsigned long) nr, depth); | ||
1246 | break; | ||
1247 | } | ||
1248 | |||
1249 | /* Go read the buffer for the next level down */ | ||
1250 | bh = sb_bread(inode->i_sb, nr); | ||
1251 | |||
1252 | /* | ||
1253 | * A read failure? Report error and clear slot | ||
1254 | * (should be rare). | ||
1255 | */ | ||
1256 | if (!bh) { | ||
1257 | EXT4_ERROR_INODE_BLOCK(inode, nr, | ||
1258 | "Read failure"); | ||
1259 | continue; | ||
1260 | } | ||
1261 | |||
1262 | /* This zaps the entire block. Bottom up. */ | ||
1263 | BUFFER_TRACE(bh, "free child branches"); | ||
1264 | ext4_free_branches(handle, inode, bh, | ||
1265 | (__le32 *) bh->b_data, | ||
1266 | (__le32 *) bh->b_data + addr_per_block, | ||
1267 | depth); | ||
1268 | brelse(bh); | ||
1269 | |||
1270 | /* | ||
1271 | * Everything below this pointer has been | ||
1272 | * released. Now let this top-of-subtree go. | ||
1273 | * | ||
1274 | * We want the freeing of this indirect block to be | ||
1275 | * atomic in the journal with the updating of the | ||
1276 | * bitmap block which owns it. So make some room in | ||
1277 | * the journal. | ||
1278 | * | ||
1279 | * We zero the parent pointer *after* freeing its | ||
1280 | * pointee in the bitmaps, so if extend_transaction() | ||
1281 | * for some reason fails to put the bitmap changes and | ||
1282 | * the release into the same transaction, recovery | ||
1283 | * will merely complain about releasing a free block, | ||
1284 | * rather than leaking blocks. | ||
1285 | */ | ||
1286 | if (ext4_handle_is_aborted(handle)) | ||
1287 | return; | ||
1288 | if (try_to_extend_transaction(handle, inode)) { | ||
1289 | ext4_mark_inode_dirty(handle, inode); | ||
1290 | ext4_truncate_restart_trans(handle, inode, | ||
1291 | ext4_blocks_for_truncate(inode)); | ||
1292 | } | ||
1293 | |||
1294 | /* | ||
1295 | * The forget flag here is critical because if | ||
1296 | * we are journaling (and not doing data | ||
1297 | * journaling), we have to make sure a revoke | ||
1298 | * record is written to prevent the journal | ||
1299 | * replay from overwriting the (former) | ||
1300 | * indirect block if it gets reallocated as a | ||
1301 | * data block. This must happen in the same | ||
1302 | * transaction where the data blocks are | ||
1303 | * actually freed. | ||
1304 | */ | ||
1305 | ext4_free_blocks(handle, inode, NULL, nr, 1, | ||
1306 | EXT4_FREE_BLOCKS_METADATA| | ||
1307 | EXT4_FREE_BLOCKS_FORGET); | ||
1308 | |||
1309 | if (parent_bh) { | ||
1310 | /* | ||
1311 | * The block which we have just freed is | ||
1312 | * pointed to by an indirect block: journal it | ||
1313 | */ | ||
1314 | BUFFER_TRACE(parent_bh, "get_write_access"); | ||
1315 | if (!ext4_journal_get_write_access(handle, | ||
1316 | parent_bh)){ | ||
1317 | *p = 0; | ||
1318 | BUFFER_TRACE(parent_bh, | ||
1319 | "call ext4_handle_dirty_metadata"); | ||
1320 | ext4_handle_dirty_metadata(handle, | ||
1321 | inode, | ||
1322 | parent_bh); | ||
1323 | } | ||
1324 | } | ||
1325 | } | ||
1326 | } else { | ||
1327 | /* We have reached the bottom of the tree. */ | ||
1328 | BUFFER_TRACE(parent_bh, "free data blocks"); | ||
1329 | ext4_free_data(handle, inode, parent_bh, first, last); | ||
1330 | } | ||
1331 | } | ||
1332 | |||
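A compact model of the recursion: children are released before the indirect block that points at them (bottom-up), with indirect levels walked last-to-first as above and the data level walked first-to-last as in ext4_free_data(). The toy disk maps a block number to its pointer array; everything here is illustrative, not ext4 API.

#include <stdio.h>
#include <stdint.h>

#define SLOTS 4

static uint32_t disk[8][SLOTS] = {
	[5] = {6, 0, 7, 0},		/* double-indirect block 5 */
	[6] = {100, 101, 0, 0},		/* indirect block 6 -> data */
	[7] = {200, 0, 0, 0},		/* indirect block 7 -> data */
};

static void free_branches(const uint32_t *first, const uint32_t *last,
			  int depth)
{
	if (depth--) {
		size_t n = last - first;

		while (n--) {		/* indirect levels: last to first */
			const uint32_t *p = first + n;

			if (!*p)
				continue;	/* a hole */
			free_branches(disk[*p], disk[*p] + SLOTS, depth);
			printf("free indirect block %u\n", *p);
		}
	} else {
		const uint32_t *p;

		for (p = first; p < last; p++)	/* bottom: data blocks */
			if (*p)
				printf("free data block %u\n", *p);
	}
}

int main(void)
{
	uint32_t root[1] = {5};

	/* depth 2: dindirect -> indirect -> data; frees 200, 7, 100,
	 * 101, 6, then 5, i.e. strictly bottom-up. */
	free_branches(root, root + 1, 2);
	return 0;
}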
1333 | void ext4_ind_truncate(struct inode *inode) | ||
1334 | { | ||
1335 | handle_t *handle; | ||
1336 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1337 | __le32 *i_data = ei->i_data; | ||
1338 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
1339 | struct address_space *mapping = inode->i_mapping; | ||
1340 | ext4_lblk_t offsets[4]; | ||
1341 | Indirect chain[4]; | ||
1342 | Indirect *partial; | ||
1343 | __le32 nr = 0; | ||
1344 | int n = 0; | ||
1345 | ext4_lblk_t last_block, max_block; | ||
1346 | unsigned blocksize = inode->i_sb->s_blocksize; | ||
1347 | |||
1348 | handle = start_transaction(inode); | ||
1349 | if (IS_ERR(handle)) | ||
1350 | return; /* AKPM: return what? */ | ||
1351 | |||
1352 | last_block = (inode->i_size + blocksize-1) | ||
1353 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
1354 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
1355 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
1356 | |||
1357 | if (inode->i_size & (blocksize - 1)) | ||
1358 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | ||
1359 | goto out_stop; | ||
1360 | |||
1361 | if (last_block != max_block) { | ||
1362 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | ||
1363 | if (n == 0) | ||
1364 | goto out_stop; /* error */ | ||
1365 | } | ||
1366 | |||
1367 | /* | ||
1368 | * OK. This truncate is going to happen. We add the inode to the | ||
1369 | * orphan list, so that if this truncate spans multiple transactions, | ||
1370 | * and we crash, we will resume the truncate when the filesystem | ||
1371 | * recovers. It also marks the inode dirty, to catch the new size. | ||
1372 | * | ||
1373 | * Implication: the file must always be in a sane, consistent | ||
1374 | * truncatable state while each transaction commits. | ||
1375 | */ | ||
1376 | if (ext4_orphan_add(handle, inode)) | ||
1377 | goto out_stop; | ||
1378 | |||
1379 | /* | ||
1380 | * From here we block out all ext4_get_block() callers who want to | ||
1381 | * modify the block allocation tree. | ||
1382 | */ | ||
1383 | down_write(&ei->i_data_sem); | ||
1384 | |||
1385 | ext4_discard_preallocations(inode); | ||
1386 | |||
1387 | /* | ||
1388 | * The orphan list entry will now protect us from any crash which | ||
1389 | * occurs before the truncate completes, so it is now safe to propagate | ||
1390 | * the new, shorter inode size (held for now in i_size) into the | ||
1391 | * on-disk inode. We do this via i_disksize, which is the value which | ||
1392 | * ext4 *really* writes onto the disk inode. | ||
1393 | */ | ||
1394 | ei->i_disksize = inode->i_size; | ||
1395 | |||
1396 | if (last_block == max_block) { | ||
1397 | /* | ||
1398 | * It is unnecessary to free any data blocks if last_block is | ||
1399 | * equal to the indirect block limit. | ||
1400 | */ | ||
1401 | goto out_unlock; | ||
1402 | } else if (n == 1) { /* direct blocks */ | ||
1403 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | ||
1404 | i_data + EXT4_NDIR_BLOCKS); | ||
1405 | goto do_indirects; | ||
1406 | } | ||
1407 | |||
1408 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | ||
1409 | /* Kill the top of shared branch (not detached) */ | ||
1410 | if (nr) { | ||
1411 | if (partial == chain) { | ||
1412 | /* Shared branch grows from the inode */ | ||
1413 | ext4_free_branches(handle, inode, NULL, | ||
1414 | &nr, &nr+1, (chain+n-1) - partial); | ||
1415 | *partial->p = 0; | ||
1416 | /* | ||
1417 | * We mark the inode dirty prior to restart, | ||
1418 | * and prior to stop. No need for it here. | ||
1419 | */ | ||
1420 | } else { | ||
1421 | /* Shared branch grows from an indirect block */ | ||
1422 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
1423 | ext4_free_branches(handle, inode, partial->bh, | ||
1424 | partial->p, | ||
1425 | partial->p+1, (chain+n-1) - partial); | ||
1426 | } | ||
1427 | } | ||
1428 | /* Clear the ends of indirect blocks on the shared branch */ | ||
1429 | while (partial > chain) { | ||
1430 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | ||
1431 | (__le32*)partial->bh->b_data+addr_per_block, | ||
1432 | (chain+n-1) - partial); | ||
1433 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1434 | brelse(partial->bh); | ||
1435 | partial--; | ||
1436 | } | ||
1437 | do_indirects: | ||
1438 | /* Kill the remaining (whole) subtrees */ | ||
1439 | switch (offsets[0]) { | ||
1440 | default: | ||
1441 | nr = i_data[EXT4_IND_BLOCK]; | ||
1442 | if (nr) { | ||
1443 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | ||
1444 | i_data[EXT4_IND_BLOCK] = 0; | ||
1445 | } | ||
1446 | case EXT4_IND_BLOCK: | ||
1447 | nr = i_data[EXT4_DIND_BLOCK]; | ||
1448 | if (nr) { | ||
1449 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | ||
1450 | i_data[EXT4_DIND_BLOCK] = 0; | ||
1451 | } | ||
1452 | case EXT4_DIND_BLOCK: | ||
1453 | nr = i_data[EXT4_TIND_BLOCK]; | ||
1454 | if (nr) { | ||
1455 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | ||
1456 | i_data[EXT4_TIND_BLOCK] = 0; | ||
1457 | } | ||
1458 | case EXT4_TIND_BLOCK: | ||
1459 | ; | ||
1460 | } | ||
1461 | |||
1462 | out_unlock: | ||
1463 | up_write(&ei->i_data_sem); | ||
1464 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
1465 | ext4_mark_inode_dirty(handle, inode); | ||
1466 | |||
1467 | /* | ||
1468 | * In a multi-transaction truncate, we only make the final transaction | ||
1469 | * synchronous | ||
1470 | */ | ||
1471 | if (IS_SYNC(inode)) | ||
1472 | ext4_handle_sync(handle); | ||
1473 | out_stop: | ||
1474 | /* | ||
1475 | * If this was a simple ftruncate(), and the file will remain alive | ||
1476 | * then we need to clear up the orphan record which we created above. | ||
1477 | * However, if this was a real unlink then we were called by | ||
1478 | * ext4_delete_inode(), and we allow that function to clean up the | ||
1479 | * orphan info for us. | ||
1480 | */ | ||
1481 | if (inode->i_nlink) | ||
1482 | ext4_orphan_del(handle, inode); | ||
1483 | |||
1484 | ext4_journal_stop(handle); | ||
1485 | trace_ext4_truncate_exit(inode); | ||
1486 | } | ||
1487 | |||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e3126c05100..5dbdb6b91ae 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -12,10 +12,6 @@ | |||
12 | * | 12 | * |
13 | * Copyright (C) 1991, 1992 Linus Torvalds | 13 | * Copyright (C) 1991, 1992 Linus Torvalds |
14 | * | 14 | * |
15 | * Goal-directed block allocation by Stephen Tweedie | ||
16 | * (sct@redhat.com), 1993, 1998 | ||
17 | * Big-endian to little-endian byte-swapping/bitmaps by | ||
18 | * David S. Miller (davem@caip.rutgers.edu), 1995 | ||
19 | * 64-bit file support on 64-bit platforms by Jakub Jelinek | 15 | * 64-bit file support on 64-bit platforms by Jakub Jelinek |
20 | * (jj@sunsite.ms.mff.cuni.cz) | 16 | * (jj@sunsite.ms.mff.cuni.cz) |
21 | * | 17 | * |
@@ -47,6 +43,7 @@ | |||
47 | #include "xattr.h" | 43 | #include "xattr.h" |
48 | #include "acl.h" | 44 | #include "acl.h" |
49 | #include "ext4_extents.h" | 45 | #include "ext4_extents.h" |
46 | #include "truncate.h" | ||
50 | 47 | ||
51 | #include <trace/events/ext4.h> | 48 | #include <trace/events/ext4.h> |
52 | 49 | ||
@@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) | |||
89 | } | 86 | } |
90 | 87 | ||
91 | /* | 88 | /* |
92 | * Work out how many blocks we need to proceed with the next chunk of a | ||
93 | * truncate transaction. | ||
94 | */ | ||
95 | static unsigned long blocks_for_truncate(struct inode *inode) | ||
96 | { | ||
97 | ext4_lblk_t needed; | ||
98 | |||
99 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | ||
100 | |||
101 | /* Give ourselves just enough room to cope with inodes in which | ||
102 | * i_blocks is corrupt: we've seen disk corruptions in the past | ||
103 | * which resulted in random data in an inode which looked enough | ||
104 | * like a regular file for ext4 to try to delete it. Things | ||
105 | * will go a bit crazy if that happens, but at least we should | ||
106 | * try not to panic the whole kernel. */ | ||
107 | if (needed < 2) | ||
108 | needed = 2; | ||
109 | |||
110 | /* But we need to bound the transaction so we don't overflow the | ||
111 | * journal. */ | ||
112 | if (needed > EXT4_MAX_TRANS_DATA) | ||
113 | needed = EXT4_MAX_TRANS_DATA; | ||
114 | |||
115 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; | ||
116 | } | ||
117 | |||
118 | /* | ||
119 | * Truncate transactions can be complex and absolutely huge. So we need to | ||
120 | * be able to restart the transaction at a conventient checkpoint to make | ||
121 | * sure we don't overflow the journal. | ||
122 | * | ||
123 | * start_transaction gets us a new handle for a truncate transaction, | ||
124 | * and extend_transaction tries to extend the existing one a bit. If | ||
125 | * extend fails, we need to propagate the failure up and restart the | ||
126 | * transaction in the top-level truncate loop. --sct | ||
127 | */ | ||
128 | static handle_t *start_transaction(struct inode *inode) | ||
129 | { | ||
130 | handle_t *result; | ||
131 | |||
132 | result = ext4_journal_start(inode, blocks_for_truncate(inode)); | ||
133 | if (!IS_ERR(result)) | ||
134 | return result; | ||
135 | |||
136 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
137 | return result; | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Try to extend this transaction for the purposes of truncation. | ||
142 | * | ||
143 | * Returns 0 if we managed to create more room. If we can't create more | ||
144 | * room, and the transaction must be restarted we return 1. | ||
145 | */ | ||
146 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | ||
147 | { | ||
148 | if (!ext4_handle_valid(handle)) | ||
149 | return 0; | ||
150 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | ||
151 | return 0; | ||
152 | if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) | ||
153 | return 0; | ||
154 | return 1; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Restart the transaction associated with *handle. This does a commit, | 89 | * Restart the transaction associated with *handle. This does a commit, |
159 | * so before we call here everything must be consistently dirtied against | 90 | * so before we call here everything must be consistently dirtied against |
160 | * this transaction. | 91 | * this transaction. |
@@ -189,7 +120,37 @@ void ext4_evict_inode(struct inode *inode) | |||
189 | int err; | 120 | int err; |
190 | 121 | ||
191 | trace_ext4_evict_inode(inode); | 122 | trace_ext4_evict_inode(inode); |
123 | |||
124 | ext4_ioend_wait(inode); | ||
125 | |||
192 | if (inode->i_nlink) { | 126 | if (inode->i_nlink) { |
127 | /* | ||
128 | * When journalling data, dirty buffers are tracked only in the | ||
129 | * journal. So although mm thinks everything is clean and | ||
130 | * ready for reaping the inode might still have some pages to | ||
131 | * write in the running transaction or waiting to be | ||
132 | * checkpointed. Thus calling jbd2_journal_invalidatepage() | ||
133 | * (via truncate_inode_pages()) to discard these buffers can | ||
134 | * cause data loss. Also even if we did not discard these | ||
135 | * buffers, we would have no way to find them after the inode | ||
136 | * is reaped and thus the user could see stale data when trying to | ||
137 | * read them before the transaction is checkpointed. So be | ||
138 | * careful and force everything to disk here... We use | ||
139 | * ei->i_datasync_tid to store the newest transaction | ||
140 | * containing inode's data. | ||
141 | * | ||
142 | * Note that directories do not have this problem because they | ||
143 | * don't use page cache. | ||
144 | */ | ||
145 | if (ext4_should_journal_data(inode) && | ||
146 | (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { | ||
147 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | ||
148 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; | ||
149 | |||
150 | jbd2_log_start_commit(journal, commit_tid); | ||
151 | jbd2_log_wait_commit(journal, commit_tid); | ||
152 | filemap_write_and_wait(&inode->i_data); | ||
153 | } | ||
193 | truncate_inode_pages(&inode->i_data, 0); | 154 | truncate_inode_pages(&inode->i_data, 0); |
194 | goto no_delete; | 155 | goto no_delete; |
195 | } | 156 | } |
@@ -204,7 +165,7 @@ void ext4_evict_inode(struct inode *inode) | |||
204 | if (is_bad_inode(inode)) | 165 | if (is_bad_inode(inode)) |
205 | goto no_delete; | 166 | goto no_delete; |
206 | 167 | ||
207 | handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); | 168 | handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); |
208 | if (IS_ERR(handle)) { | 169 | if (IS_ERR(handle)) { |
209 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | 170 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); |
210 | /* | 171 | /* |
@@ -277,793 +238,6 @@ no_delete: | |||
277 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ | 238 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ |
278 | } | 239 | } |
279 | 240 | ||
280 | typedef struct { | ||
281 | __le32 *p; | ||
282 | __le32 key; | ||
283 | struct buffer_head *bh; | ||
284 | } Indirect; | ||
285 | |||
286 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | ||
287 | { | ||
288 | p->key = *(p->p = v); | ||
289 | p->bh = bh; | ||
290 | } | ||
291 | |||
292 | /** | ||
293 | * ext4_block_to_path - parse the block number into array of offsets | ||
294 | * @inode: inode in question (we are only interested in its superblock) | ||
295 | * @i_block: block number to be parsed | ||
296 | * @offsets: array to store the offsets in | ||
297 | * @boundary: set this non-zero if the referred-to block is likely to be | ||
298 | * followed (on disk) by an indirect block. | ||
299 | * | ||
300 | * To store the locations of a file's data, ext4 uses a data structure common |||
301 | * for UNIX filesystems - tree of pointers anchored in the inode, with | ||
302 | * data blocks at leaves and indirect blocks in intermediate nodes. | ||
303 | * This function translates the block number into a path in that tree - |||
304 | * the return value is the path length and @offsets[n] is the offset of |||
305 | * the pointer to the (n+1)th node in the nth one. If @i_block is out of |||
306 | * range (negative or too large), a warning is printed and zero is returned. |||
307 | * | ||
308 | * Note: function doesn't find node addresses, so no IO is needed. All | ||
309 | * we need to know is the capacity of indirect blocks (taken from the | ||
310 | * inode->i_sb). | ||
311 | */ | ||
312 | |||
313 | /* | ||
314 | * Portability note: the last comparison (check that we fit into triple | ||
315 | * indirect block) is spelled differently, because otherwise on an | ||
316 | * architecture with 32-bit longs and 8Kb pages we might get into trouble | ||
317 | * if our filesystem had 8Kb blocks. We might use long long, but that would | ||
318 | * kill us on x86. Oh, well, at least the sign propagation does not matter - | ||
319 | * i_block would have to be negative in the very beginning, so we would not | ||
320 | * get there at all. | ||
321 | */ | ||
322 | |||
323 | static int ext4_block_to_path(struct inode *inode, | ||
324 | ext4_lblk_t i_block, | ||
325 | ext4_lblk_t offsets[4], int *boundary) | ||
326 | { | ||
327 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
328 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | ||
329 | const long direct_blocks = EXT4_NDIR_BLOCKS, | ||
330 | indirect_blocks = ptrs, | ||
331 | double_blocks = (1 << (ptrs_bits * 2)); | ||
332 | int n = 0; | ||
333 | int final = 0; | ||
334 | |||
335 | if (i_block < direct_blocks) { | ||
336 | offsets[n++] = i_block; | ||
337 | final = direct_blocks; | ||
338 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | ||
339 | offsets[n++] = EXT4_IND_BLOCK; | ||
340 | offsets[n++] = i_block; | ||
341 | final = ptrs; | ||
342 | } else if ((i_block -= indirect_blocks) < double_blocks) { | ||
343 | offsets[n++] = EXT4_DIND_BLOCK; | ||
344 | offsets[n++] = i_block >> ptrs_bits; | ||
345 | offsets[n++] = i_block & (ptrs - 1); | ||
346 | final = ptrs; | ||
347 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { | ||
348 | offsets[n++] = EXT4_TIND_BLOCK; | ||
349 | offsets[n++] = i_block >> (ptrs_bits * 2); | ||
350 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); | ||
351 | offsets[n++] = i_block & (ptrs - 1); | ||
352 | final = ptrs; | ||
353 | } else { | ||
354 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", | ||
355 | i_block + direct_blocks + | ||
356 | indirect_blocks + double_blocks, inode->i_ino); | ||
357 | } | ||
358 | if (boundary) | ||
359 | *boundary = final - 1 - (i_block & (ptrs - 1)); | ||
360 | return n; | ||
361 | } | ||
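To make the offset arithmetic above concrete, here is a standalone userspace sketch of the same computation, assuming 4 KiB blocks (12 direct slots, 1024 pointers per indirect block). The constants and the block_to_path() name below are illustrative, not the kernel's:

#include <stdio.h>

#define NDIR      12      /* direct slots, as EXT4_NDIR_BLOCKS */
#define PTRS      1024    /* pointers per 4 KiB indirect block */
#define PTRS_BITS 10

/* Same branch selection as ext4_block_to_path(), on plain longs;
 * returns the path depth, or 0 for an out-of-range block. */
static int block_to_path(long i_block, long offsets[4])
{
        int n = 0;

        if (i_block < NDIR) {
                offsets[n++] = i_block;
        } else if ((i_block -= NDIR) < PTRS) {
                offsets[n++] = NDIR;            /* EXT4_IND_BLOCK slot */
                offsets[n++] = i_block;
        } else if ((i_block -= PTRS) < (1L << (PTRS_BITS * 2))) {
                offsets[n++] = NDIR + 1;        /* EXT4_DIND_BLOCK slot */
                offsets[n++] = i_block >> PTRS_BITS;
                offsets[n++] = i_block & (PTRS - 1);
        } else if (((i_block -= 1L << (PTRS_BITS * 2)) >> (PTRS_BITS * 2)) < PTRS) {
                offsets[n++] = NDIR + 2;        /* EXT4_TIND_BLOCK slot */
                offsets[n++] = i_block >> (PTRS_BITS * 2);
                offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
                offsets[n++] = i_block & (PTRS - 1);
        }
        return n;
}

int main(void)
{
        long off[4];
        int i, depth = block_to_path(5000, off);

        /* 5000 - 12 - 1024 = 3964 -> [13, 3, 892]: the double-indirect
         * slot, entry 3 of the DIND block, entry 892 of that block. */
        for (i = 0; i < depth; i++)
                printf("%s%ld", i ? " " : "", off[i]);
        printf("\n");
        return 0;
}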
362 | |||
363 | static int __ext4_check_blockref(const char *function, unsigned int line, | ||
364 | struct inode *inode, | ||
365 | __le32 *p, unsigned int max) | ||
366 | { | ||
367 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
368 | __le32 *bref = p; | ||
369 | unsigned int blk; | ||
370 | |||
371 | while (bref < p+max) { | ||
372 | blk = le32_to_cpu(*bref++); | ||
373 | if (blk && | ||
374 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
375 | blk, 1))) { | ||
376 | es->s_last_error_block = cpu_to_le64(blk); | ||
377 | ext4_error_inode(inode, function, line, blk, | ||
378 | "invalid block"); | ||
379 | return -EIO; | ||
380 | } | ||
381 | } | ||
382 | return 0; | ||
383 | } | ||
384 | |||
385 | |||
386 | #define ext4_check_indirect_blockref(inode, bh) \ | ||
387 | __ext4_check_blockref(__func__, __LINE__, inode, \ | ||
388 | (__le32 *)(bh)->b_data, \ | ||
389 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) | ||
390 | |||
391 | #define ext4_check_inode_blockref(inode) \ | ||
392 | __ext4_check_blockref(__func__, __LINE__, inode, \ | ||
393 | EXT4_I(inode)->i_data, \ | ||
394 | EXT4_NDIR_BLOCKS) | ||
395 | |||
396 | /** | ||
397 | * ext4_get_branch - read the chain of indirect blocks leading to data | ||
398 | * @inode: inode in question | ||
399 | * @depth: depth of the chain (1 - direct pointer, etc.) | ||
400 | * @offsets: offsets of pointers in inode/indirect blocks | ||
401 | * @chain: place to store the result | ||
402 | * @err: here we store the error value | ||
403 | * | ||
404 | * Function fills the array of triples <key, p, bh> and returns %NULL | ||
405 | * if everything went OK or the pointer to the last filled triple | ||
406 | * (incomplete one) otherwise. Upon the return chain[i].key contains | ||
407 | * the number of (i+1)-th block in the chain (as it is stored in memory, | ||
408 | * i.e. little-endian 32-bit), chain[i].p contains the address of that | ||
409 | * number (it points into struct inode for i==0 and into the bh->b_data | ||
410 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect | ||
411 | * block for i>0 and NULL for i==0. In other words, it holds the block | ||
412 | * numbers of the chain, addresses they were taken from (and where we can | ||
413 | * verify that chain did not change) and buffer_heads hosting these | ||
414 | * numbers. | ||
415 | * | ||
416 | * Function stops when it stumbles upon a zero pointer (absent block) |||
417 | * (pointer to last triple returned, *@err == 0) | ||
418 | * or when it gets an IO error reading an indirect block | ||
419 | * (ditto, *@err == -EIO) | ||
420 | * or when it reads all @depth-1 indirect blocks successfully and finds | ||
421 | * the whole chain, all the way to the data (returns %NULL, *err == 0). |||
422 | * | ||
423 | * Needs to be called with |||
424 | * down_read(&EXT4_I(inode)->i_data_sem) | ||
425 | */ | ||
426 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | ||
427 | ext4_lblk_t *offsets, | ||
428 | Indirect chain[4], int *err) | ||
429 | { | ||
430 | struct super_block *sb = inode->i_sb; | ||
431 | Indirect *p = chain; | ||
432 | struct buffer_head *bh; | ||
433 | |||
434 | *err = 0; | ||
435 | /* i_data is not going away, no lock needed */ | ||
436 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); | ||
437 | if (!p->key) | ||
438 | goto no_block; | ||
439 | while (--depth) { | ||
440 | bh = sb_getblk(sb, le32_to_cpu(p->key)); | ||
441 | if (unlikely(!bh)) | ||
442 | goto failure; | ||
443 | |||
444 | if (!bh_uptodate_or_lock(bh)) { | ||
445 | if (bh_submit_read(bh) < 0) { | ||
446 | put_bh(bh); | ||
447 | goto failure; | ||
448 | } | ||
449 | /* validate block references */ | ||
450 | if (ext4_check_indirect_blockref(inode, bh)) { | ||
451 | put_bh(bh); | ||
452 | goto failure; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); | ||
457 | /* Reader: end */ | ||
458 | if (!p->key) | ||
459 | goto no_block; | ||
460 | } | ||
461 | return NULL; | ||
462 | |||
463 | failure: | ||
464 | *err = -EIO; | ||
465 | no_block: | ||
466 | return p; | ||
467 | } | ||
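The triple chain that ext4_get_branch() fills is easier to see on a toy model. The sketch below is illustrative only: an in-memory table stands in for the disk, plain uint32_t for __le32, and there are no buffer_heads, locking, or validity checks:

#include <stdio.h>
#include <stdint.h>

#define PTRS 4                          /* tiny indirect blocks */

struct triple { uint32_t *p; uint32_t key; };

/* Walk offsets[] through disk[], recording <p, key> at each level;
 * returns NULL on a full chain, else the last (incomplete) triple. */
static struct triple *get_branch(uint32_t disk[][PTRS], uint32_t *i_data,
                                 int depth, const int *offsets,
                                 struct triple chain[4])
{
        struct triple *t = chain;
        int i;

        t->p = &i_data[offsets[0]];
        t->key = *t->p;
        for (i = 1; t->key && i < depth; i++) {
                t++;
                t->p = &disk[(t - 1)->key][offsets[i]];
                t->key = *t->p;
        }
        return (t->key && i == depth) ? NULL : t;
}

int main(void)
{
        uint32_t i_data[15] = { 0 };
        uint32_t disk[16][PTRS] = { { 0 } };
        struct triple chain[4], *partial;
        int offsets[2] = { 12, 1 };     /* indirect slot, then leaf 1 */

        i_data[12] = 7;                 /* indirect block lives at "7" */
        disk[7][1] = 42;                /* leaf entry 1 -> data block 42 */
        partial = get_branch(disk, i_data, 2, offsets, chain);
        if (partial)
                printf("hole at depth %ld\n", (long)(partial - chain));
        else
                printf("data block %u\n", chain[1].key);
        return 0;
}

Here the walk finds the full chain and reports data block 42; zeroing disk[7][1] instead makes it return the partial triple, which is the hole case described above.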
468 | |||
469 | /** | ||
470 | * ext4_find_near - find a place for allocation with sufficient locality | ||
471 | * @inode: owner | ||
472 | * @ind: descriptor of indirect block. | ||
473 | * | ||
474 | * This function returns the preferred place for block allocation. | ||
475 | * It is used when heuristic for sequential allocation fails. | ||
476 | * Rules are: | ||
477 | * + if there is a block to the left of our position - allocate near it. | ||
478 | * + if pointer will live in indirect block - allocate near that block. | ||
479 | * + if pointer will live in inode - allocate in the same | ||
480 | * cylinder group. | ||
481 | * | ||
482 | * In the latter case we colour the starting block by the caller's PID to |||
483 | * prevent it from clashing with concurrent allocations for a different inode | ||
484 | * in the same block group. The PID is used here so that functionally related | ||
485 | * files will be close-by on-disk. | ||
486 | * | ||
487 | * Caller must make sure that @ind is valid and will stay that way. | ||
488 | */ | ||
489 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | ||
490 | { | ||
491 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
492 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; | ||
493 | __le32 *p; | ||
494 | ext4_fsblk_t bg_start; | ||
495 | ext4_fsblk_t last_block; | ||
496 | ext4_grpblk_t colour; | ||
497 | ext4_group_t block_group; | ||
498 | int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); | ||
499 | |||
500 | /* Try to find previous block */ | ||
501 | for (p = ind->p - 1; p >= start; p--) { | ||
502 | if (*p) | ||
503 | return le32_to_cpu(*p); | ||
504 | } | ||
505 | |||
506 | /* No such thing, so let's try location of indirect block */ | ||
507 | if (ind->bh) | ||
508 | return ind->bh->b_blocknr; | ||
509 | |||
510 | /* | ||
511 | * It is going to be referred to from the inode itself? OK, just put it | ||
512 | * into the same cylinder group then. | ||
513 | */ | ||
514 | block_group = ei->i_block_group; | ||
515 | if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { | ||
516 | block_group &= ~(flex_size-1); | ||
517 | if (S_ISREG(inode->i_mode)) | ||
518 | block_group++; | ||
519 | } | ||
520 | bg_start = ext4_group_first_block_no(inode->i_sb, block_group); | ||
521 | last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; | ||
522 | |||
523 | /* | ||
524 | * If we are doing delayed allocation, we don't need to take |||
525 | * colour into account. | ||
526 | */ | ||
527 | if (test_opt(inode->i_sb, DELALLOC)) | ||
528 | return bg_start; | ||
529 | |||
530 | if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) | ||
531 | colour = (current->pid % 16) * | ||
532 | (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); | ||
533 | else | ||
534 | colour = (current->pid % 16) * ((last_block - bg_start) / 16); | ||
535 | return bg_start + colour; | ||
536 | } | ||
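As a worked example of the colouring above (a sketch, not kernel code): with 4 KiB blocks a full group holds 32768 blocks, so each of the 16 PID slots spans 2048 blocks, and the goal becomes bg_start plus that offset:

/* e.g. pid 1234 -> 1234 % 16 == 2 -> colour 2 * 2048 == 4096 */
static unsigned long pid_colour(int pid, unsigned long blocks_per_group)
{
        return (pid % 16) * (blocks_per_group / 16);
}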
537 | |||
538 | /** | ||
539 | * ext4_find_goal - find a preferred place for allocation. | ||
540 | * @inode: owner | ||
541 | * @block: block we want | ||
542 | * @partial: pointer to the last triple within a chain | ||
543 | * | ||
544 | * Normally this function finds the preferred place for block allocation |||
545 | * and returns it. |||
546 | * Because this is only used for non-extent files, we limit the block nr | ||
547 | * to 32 bits. | ||
548 | */ | ||
549 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | ||
550 | Indirect *partial) | ||
551 | { | ||
552 | ext4_fsblk_t goal; | ||
553 | |||
554 | /* | ||
555 | * XXX need to get goal block from mballoc's data structures | ||
556 | */ | ||
557 | |||
558 | goal = ext4_find_near(inode, partial); | ||
559 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
560 | return goal; | ||
561 | } | ||
562 | |||
563 | /** | ||
564 | * ext4_blks_to_allocate - Look up the block map and count the number | ||
565 | * of direct blocks that need to be allocated for the given branch. |||
566 | * | ||
567 | * @branch: chain of indirect blocks | ||
568 | * @k: number of blocks needed for indirect blocks |||
569 | * @blks: number of data blocks to be mapped. | ||
570 | * @blocks_to_boundary: the offset in the indirect block | ||
571 | * | ||
572 | * return the total number of blocks to be allocated, including the |||
573 | * direct and indirect blocks. | ||
574 | */ | ||
575 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | ||
576 | int blocks_to_boundary) | ||
577 | { | ||
578 | unsigned int count = 0; | ||
579 | |||
580 | /* | ||
581 | * Simple case: the [t,d]indirect block(s) have not been allocated |||
582 | * yet, so clearly the blocks on that path have not been allocated either |||
583 | */ | ||
584 | if (k > 0) { | ||
585 | /* right now we don't handle cross boundary allocation */ | ||
586 | if (blks < blocks_to_boundary + 1) | ||
587 | count += blks; | ||
588 | else | ||
589 | count += blocks_to_boundary + 1; | ||
590 | return count; | ||
591 | } | ||
592 | |||
593 | count++; | ||
594 | while (count < blks && count <= blocks_to_boundary && | ||
595 | le32_to_cpu(*(branch[0].p + count)) == 0) { | ||
596 | count++; | ||
597 | } | ||
598 | return count; | ||
599 | } | ||
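A minimal model of the k == 0 counting loop above: starting from the slot we want, the run extends over zero (unallocated) leaf entries, capped by both the request size and the boundary offset. The flat slot array and the name are illustrative:

/* e.g. slot = {0, 0, 0, 9, ...}, blks = 8, boundary = 5 -> returns 3 */
static unsigned int blks_to_allocate(const unsigned int *slot,
                                     unsigned int blks,
                                     unsigned int blocks_to_boundary)
{
        unsigned int count = 1;

        while (count < blks && count <= blocks_to_boundary &&
               slot[count] == 0)
                count++;
        return count;
}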
600 | |||
601 | /** | ||
602 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch | ||
603 | * @handle: handle for this transaction | ||
604 | * @inode: inode which needs allocated blocks | ||
605 | * @iblock: the logical block to start allocated at | ||
606 | * @goal: preferred physical block of allocation | ||
607 | * @indirect_blks: the number of blocks we need to allocate for indirect |||
608 | * blocks | ||
609 | * @blks: number of desired blocks | ||
610 | * @new_blocks: on return it will store the new block numbers for | ||
611 | * the indirect blocks (if needed) and the first direct block, |||
612 | * @err: on return it will store the error code | ||
613 | * | ||
614 | * This function will return the number of blocks allocated as | ||
615 | * requested by the passed-in parameters. | ||
616 | */ | ||
617 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
618 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
619 | int indirect_blks, int blks, | ||
620 | ext4_fsblk_t new_blocks[4], int *err) | ||
621 | { | ||
622 | struct ext4_allocation_request ar; | ||
623 | int target, i; | ||
624 | unsigned long count = 0, blk_allocated = 0; | ||
625 | int index = 0; | ||
626 | ext4_fsblk_t current_block = 0; | ||
627 | int ret = 0; | ||
628 | |||
629 | /* | ||
630 | * Here we try to allocate the requested multiple blocks at once, | ||
631 | * on a best-effort basis. | ||
632 | * To build a branch, we should allocate blocks for | ||
633 | * the indirect blocks (if not allocated yet), and at least |||
634 | * the first direct block of this branch. That's the |||
635 | * minimum number of blocks we need to allocate (required) |||
636 | */ | ||
637 | /* first we try to allocate the indirect blocks */ | ||
638 | target = indirect_blks; | ||
639 | while (target > 0) { | ||
640 | count = target; | ||
641 | /* allocating blocks for indirect blocks and direct blocks */ | ||
642 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
643 | 0, &count, err); | ||
644 | if (*err) | ||
645 | goto failed_out; | ||
646 | |||
647 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
648 | EXT4_ERROR_INODE(inode, | ||
649 | "current_block %llu + count %lu > %d!", | ||
650 | current_block, count, | ||
651 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
652 | *err = -EIO; | ||
653 | goto failed_out; | ||
654 | } | ||
655 | |||
656 | target -= count; | ||
657 | /* allocate blocks for indirect blocks */ | ||
658 | while (index < indirect_blks && count) { | ||
659 | new_blocks[index++] = current_block++; | ||
660 | count--; | ||
661 | } | ||
662 | if (count > 0) { | ||
663 | /* | ||
664 | * save the new block number | ||
665 | * for the first direct block | ||
666 | */ | ||
667 | new_blocks[index] = current_block; | ||
668 | printk(KERN_INFO "%s returned more blocks than " | ||
669 | "requested\n", __func__); | ||
670 | WARN_ON(1); | ||
671 | break; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | target = blks - count; |||
676 | blk_allocated = count; | ||
677 | if (!target) | ||
678 | goto allocated; | ||
679 | /* Now allocate data blocks */ | ||
680 | memset(&ar, 0, sizeof(ar)); | ||
681 | ar.inode = inode; | ||
682 | ar.goal = goal; | ||
683 | ar.len = target; | ||
684 | ar.logical = iblock; | ||
685 | if (S_ISREG(inode->i_mode)) | ||
686 | /* enable in-core preallocation only for regular files */ | ||
687 | ar.flags = EXT4_MB_HINT_DATA; | ||
688 | |||
689 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
690 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
691 | EXT4_ERROR_INODE(inode, | ||
692 | "current_block %llu + ar.len %d > %d!", | ||
693 | current_block, ar.len, | ||
694 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
695 | *err = -EIO; | ||
696 | goto failed_out; | ||
697 | } | ||
698 | |||
699 | if (*err && (target == blks)) { | ||
700 | /* | ||
701 | * if the allocation failed and we didn't allocate | ||
702 | * any blocks before | ||
703 | */ | ||
704 | goto failed_out; | ||
705 | } | ||
706 | if (!*err) { | ||
707 | if (target == blks) { | ||
708 | /* | ||
709 | * save the new block number | ||
710 | * for the first direct block | ||
711 | */ | ||
712 | new_blocks[index] = current_block; | ||
713 | } | ||
714 | blk_allocated += ar.len; | ||
715 | } | ||
716 | allocated: | ||
717 | /* total number of blocks allocated for direct blocks */ | ||
718 | ret = blk_allocated; | ||
719 | *err = 0; | ||
720 | return ret; | ||
721 | failed_out: | ||
722 | for (i = 0; i < index; i++) | ||
723 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
724 | return ret; | ||
725 | } | ||
726 | |||
727 | /** | ||
728 | * ext4_alloc_branch - allocate and set up a chain of blocks. | ||
729 | * @handle: handle for this transaction | ||
730 | * @inode: owner | ||
731 | * @indirect_blks: number of allocated indirect blocks | ||
732 | * @blks: number of allocated direct blocks | ||
733 | * @goal: preferred place for allocation | ||
734 | * @offsets: offsets (in the blocks) to store the pointers to next. | ||
735 | * @branch: place to store the chain in. | ||
736 | * | ||
737 | * This function allocates blocks, zeroes out all but the last one, | ||
738 | * links them into chain and (if we are synchronous) writes them to disk. | ||
739 | * In other words, it prepares a branch that can be spliced onto the | ||
740 | * inode. It stores the information about that chain in the branch[], in | ||
741 | * the same format as ext4_get_branch() would do. We are calling it after | ||
742 | * we have read the existing part of the chain and partial points to the last |||
743 | * triple of that (one with zero ->key). Upon the exit we have the same | ||
744 | * picture as after the successful ext4_get_block(), except that in one | ||
745 | * place chain is disconnected - *branch->p is still zero (we did not | ||
746 | * set the last link), but branch->key contains the number that should | ||
747 | * be placed into *branch->p to fill that gap. | ||
748 | * | ||
749 | * If allocation fails we free all blocks we've allocated (and forget | ||
750 | * their buffer_heads) and return the error value from the failed |||
751 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | ||
752 | * as described above and return 0. | ||
753 | */ | ||
754 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | ||
755 | ext4_lblk_t iblock, int indirect_blks, | ||
756 | int *blks, ext4_fsblk_t goal, | ||
757 | ext4_lblk_t *offsets, Indirect *branch) | ||
758 | { | ||
759 | int blocksize = inode->i_sb->s_blocksize; | ||
760 | int i, n = 0; | ||
761 | int err = 0; | ||
762 | struct buffer_head *bh; | ||
763 | int num; | ||
764 | ext4_fsblk_t new_blocks[4]; | ||
765 | ext4_fsblk_t current_block; | ||
766 | |||
767 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
768 | *blks, new_blocks, &err); | ||
769 | if (err) | ||
770 | return err; | ||
771 | |||
772 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
773 | /* | ||
774 | * metadata blocks and data blocks are allocated. | ||
775 | */ | ||
776 | for (n = 1; n <= indirect_blks; n++) { | ||
777 | /* | ||
778 | * Get buffer_head for parent block, zero it out | ||
779 | * and set the pointer to new one, then send | ||
780 | * parent to disk. | ||
781 | */ | ||
782 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | ||
783 | if (unlikely(!bh)) { | ||
784 | err = -EIO; | ||
785 | goto failed; | ||
786 | } | ||
787 | |||
788 | branch[n].bh = bh; | ||
789 | lock_buffer(bh); | ||
790 | BUFFER_TRACE(bh, "call get_create_access"); | ||
791 | err = ext4_journal_get_create_access(handle, bh); | ||
792 | if (err) { | ||
793 | /* Don't brelse(bh) here; it's done in | ||
794 | * ext4_journal_forget() below */ | ||
795 | unlock_buffer(bh); | ||
796 | goto failed; | ||
797 | } | ||
798 | |||
799 | memset(bh->b_data, 0, blocksize); | ||
800 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | ||
801 | branch[n].key = cpu_to_le32(new_blocks[n]); | ||
802 | *branch[n].p = branch[n].key; | ||
803 | if (n == indirect_blks) { | ||
804 | current_block = new_blocks[n]; | ||
805 | /* | ||
806 | * End of chain, update the last new metablock of | ||
807 | * the chain to point to the newly allocated |||
808 | * data block numbers |||
809 | */ | ||
810 | for (i = 1; i < num; i++) | ||
811 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
812 | } | ||
813 | BUFFER_TRACE(bh, "marking uptodate"); | ||
814 | set_buffer_uptodate(bh); | ||
815 | unlock_buffer(bh); | ||
816 | |||
817 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
818 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
819 | if (err) | ||
820 | goto failed; | ||
821 | } | ||
822 | *blks = num; | ||
823 | return err; | ||
824 | failed: | ||
825 | /* Allocation failed, free what we already allocated */ | ||
826 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | ||
827 | for (i = 1; i <= n ; i++) { | ||
828 | /* | ||
829 | * branch[i].bh is newly allocated, so there is no | ||
830 | * need to revoke the block, which is why we don't | ||
831 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
832 | */ | ||
833 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
834 | EXT4_FREE_BLOCKS_FORGET); | ||
835 | } | ||
836 | for (i = n+1; i < indirect_blks; i++) | ||
837 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
838 | |||
839 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
840 | |||
841 | return err; | ||
842 | } | ||
843 | |||
844 | /** | ||
845 | * ext4_splice_branch - splice the allocated branch onto inode. | ||
846 | * @handle: handle for this transaction | ||
847 | * @inode: owner | ||
848 | * @block: (logical) number of block we are adding | ||
849 | * @chain: chain of indirect blocks (with a missing link - see | ||
850 | * ext4_alloc_branch) | ||
851 | * @where: location of missing link | ||
852 | * @num: number of indirect blocks we are adding | ||
853 | * @blks: number of direct blocks we are adding | ||
854 | * | ||
855 | * This function fills the missing link and does all housekeeping needed in | ||
856 | * inode (->i_blocks, etc.). In case of success we end up with the full | ||
857 | * chain to new block and return 0. | ||
858 | */ | ||
859 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | ||
860 | ext4_lblk_t block, Indirect *where, int num, | ||
861 | int blks) | ||
862 | { | ||
863 | int i; | ||
864 | int err = 0; | ||
865 | ext4_fsblk_t current_block; | ||
866 | |||
867 | /* | ||
868 | * If we're splicing into a [td]indirect block (as opposed to the | ||
869 | * inode) then we need to get write access to the [td]indirect block | ||
870 | * before the splice. | ||
871 | */ | ||
872 | if (where->bh) { | ||
873 | BUFFER_TRACE(where->bh, "get_write_access"); | ||
874 | err = ext4_journal_get_write_access(handle, where->bh); | ||
875 | if (err) | ||
876 | goto err_out; | ||
877 | } | ||
878 | /* That's it */ | ||
879 | |||
880 | *where->p = where->key; | ||
881 | |||
882 | /* | ||
883 | * Update the host buffer_head or inode to point to the just-allocated |||
884 | * direct blocks |||
885 | */ | ||
886 | if (num == 0 && blks > 1) { | ||
887 | current_block = le32_to_cpu(where->key) + 1; | ||
888 | for (i = 1; i < blks; i++) | ||
889 | *(where->p + i) = cpu_to_le32(current_block++); | ||
890 | } | ||
891 | |||
892 | /* We are done with atomic stuff, now do the rest of housekeeping */ | ||
893 | /* had we spliced it onto indirect block? */ | ||
894 | if (where->bh) { | ||
895 | /* | ||
896 | * If we spliced it onto an indirect block, we haven't | ||
897 | * altered the inode. Note however that if it is being spliced | ||
898 | * onto an indirect block at the very end of the file (the | ||
899 | * file is growing) then we *will* alter the inode to reflect | ||
900 | * the new i_size. But that is not done here - it is done in | ||
901 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | ||
902 | */ | ||
903 | jbd_debug(5, "splicing indirect only\n"); | ||
904 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | ||
905 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | ||
906 | if (err) | ||
907 | goto err_out; | ||
908 | } else { | ||
909 | /* | ||
910 | * OK, we spliced it into the inode itself on a direct block. | ||
911 | */ | ||
912 | ext4_mark_inode_dirty(handle, inode); | ||
913 | jbd_debug(5, "splicing direct\n"); | ||
914 | } | ||
915 | return err; | ||
916 | |||
917 | err_out: | ||
918 | for (i = 1; i <= num; i++) { | ||
919 | /* | ||
920 | * branch[i].bh is newly allocated, so there is no | ||
921 | * need to revoke the block, which is why we don't | ||
922 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
923 | */ | ||
924 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | ||
925 | EXT4_FREE_BLOCKS_FORGET); | ||
926 | } | ||
927 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | ||
928 | blks, 0); | ||
929 | |||
930 | return err; | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * The ext4_ind_map_blocks() function handles non-extent inodes |||
935 | * (i.e., using the traditional indirect/double-indirect i_blocks | ||
936 | * scheme) for ext4_map_blocks(). | ||
937 | * | ||
938 | * Allocation strategy is simple: if we have to allocate something, we will | ||
939 | * have to go the whole way to the leaf. So let's do it before attaching anything |||
940 | * to the tree, set linkage between the newborn blocks, write them if sync is |||
941 | * required, recheck the path, free and repeat if check fails, otherwise | ||
942 | * set the last missing link (that will protect us from any truncate-generated | ||
943 | * removals - all blocks on the path are immune now) and possibly force the | ||
944 | * write on the parent block. | ||
945 | * That has a nice additional property: no special recovery from the failed | ||
946 | * allocations is needed - we simply release blocks and do not touch anything | ||
947 | * reachable from inode. | ||
948 | * | ||
949 | * `handle' can be NULL if create == 0. | ||
950 | * | ||
951 | * return > 0, # of blocks mapped or allocated. | ||
952 | * return = 0, if plain lookup failed. | ||
953 | * return < 0, error case. | ||
954 | * | ||
955 | * The ext4_ind_map_blocks() function should be called with |||
956 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | ||
957 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | ||
958 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | ||
959 | * blocks. | ||
960 | */ | ||
961 | static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | ||
962 | struct ext4_map_blocks *map, | ||
963 | int flags) | ||
964 | { | ||
965 | int err = -EIO; | ||
966 | ext4_lblk_t offsets[4]; | ||
967 | Indirect chain[4]; | ||
968 | Indirect *partial; | ||
969 | ext4_fsblk_t goal; | ||
970 | int indirect_blks; | ||
971 | int blocks_to_boundary = 0; | ||
972 | int depth; | ||
973 | int count = 0; | ||
974 | ext4_fsblk_t first_block = 0; | ||
975 | |||
976 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | ||
977 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | ||
978 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | ||
979 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | ||
980 | &blocks_to_boundary); | ||
981 | |||
982 | if (depth == 0) | ||
983 | goto out; | ||
984 | |||
985 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | ||
986 | |||
987 | /* Simplest case - block found, no allocation needed */ | ||
988 | if (!partial) { | ||
989 | first_block = le32_to_cpu(chain[depth - 1].key); | ||
990 | count++; | ||
991 | /* map more blocks */ |||
992 | while (count < map->m_len && count <= blocks_to_boundary) { | ||
993 | ext4_fsblk_t blk; | ||
994 | |||
995 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | ||
996 | |||
997 | if (blk == first_block + count) | ||
998 | count++; | ||
999 | else | ||
1000 | break; | ||
1001 | } | ||
1002 | goto got_it; | ||
1003 | } | ||
1004 | |||
1005 | /* Next simple case - plain lookup or failed read of indirect block */ | ||
1006 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | ||
1007 | goto cleanup; | ||
1008 | |||
1009 | /* | ||
1010 | * Okay, we need to do block allocation. | ||
1011 | */ | ||
1012 | goal = ext4_find_goal(inode, map->m_lblk, partial); | ||
1013 | |||
1014 | /* the number of blocks need to allocate for [d,t]indirect blocks */ | ||
1015 | indirect_blks = (chain + depth) - partial - 1; | ||
1016 | |||
1017 | /* | ||
1018 | * Next look up the indirect map to count the total number of |||
1019 | * direct blocks to allocate for this branch. | ||
1020 | */ | ||
1021 | count = ext4_blks_to_allocate(partial, indirect_blks, | ||
1022 | map->m_len, blocks_to_boundary); | ||
1023 | /* | ||
1024 | * Block out ext4_truncate while we alter the tree | ||
1025 | */ | ||
1026 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | ||
1027 | &count, goal, | ||
1028 | offsets + (partial - chain), partial); | ||
1029 | |||
1030 | /* | ||
1031 | * The ext4_splice_branch call will free and forget any buffers | ||
1032 | * on the new chain if there is a failure, but that risks using | ||
1033 | * up transaction credits, especially for bitmaps where the | ||
1034 | * credits cannot be returned. Can we handle this somehow? We | ||
1035 | * may need to return -EAGAIN upwards in the worst case. --sct | ||
1036 | */ | ||
1037 | if (!err) | ||
1038 | err = ext4_splice_branch(handle, inode, map->m_lblk, | ||
1039 | partial, indirect_blks, count); | ||
1040 | if (err) | ||
1041 | goto cleanup; | ||
1042 | |||
1043 | map->m_flags |= EXT4_MAP_NEW; | ||
1044 | |||
1045 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
1046 | got_it: | ||
1047 | map->m_flags |= EXT4_MAP_MAPPED; | ||
1048 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | ||
1049 | map->m_len = count; | ||
1050 | if (count > blocks_to_boundary) | ||
1051 | map->m_flags |= EXT4_MAP_BOUNDARY; | ||
1052 | err = count; | ||
1053 | /* Clean up and exit */ | ||
1054 | partial = chain + depth - 1; /* the whole chain */ | ||
1055 | cleanup: | ||
1056 | while (partial > chain) { | ||
1057 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1058 | brelse(partial->bh); | ||
1059 | partial--; | ||
1060 | } | ||
1061 | out: | ||
1062 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, | ||
1063 | map->m_pblk, map->m_len, err); | ||
1064 | return err; | ||
1065 | } | ||
1066 | |||
1067 | #ifdef CONFIG_QUOTA | 241 | #ifdef CONFIG_QUOTA |
1068 | qsize_t *ext4_get_reserved_space(struct inode *inode) | 242 | qsize_t *ext4_get_reserved_space(struct inode *inode) |
1069 | { | 243 | { |
@@ -1073,33 +247,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) | |||
1073 | 247 | ||
1074 | /* | 248 | /* |
1075 | * Calculate the number of metadata blocks needed to reserve | 249 | * Calculate the number of metadata blocks needed to reserve |
1076 | * to allocate a new block at @lblock for a non-extent-based file |||
1077 | */ | ||
1078 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, | ||
1079 | sector_t lblock) | ||
1080 | { | ||
1081 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1082 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); | ||
1083 | int blk_bits; | ||
1084 | |||
1085 | if (lblock < EXT4_NDIR_BLOCKS) | ||
1086 | return 0; | ||
1087 | |||
1088 | lblock -= EXT4_NDIR_BLOCKS; | ||
1089 | |||
1090 | if (ei->i_da_metadata_calc_len && | ||
1091 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | ||
1092 | ei->i_da_metadata_calc_len++; | ||
1093 | return 0; | ||
1094 | } | ||
1095 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | ||
1096 | ei->i_da_metadata_calc_len = 1; | ||
1097 | blk_bits = order_base_2(lblock); | ||
1098 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | ||
1099 | } | ||
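The bound computed above can be checked in userspace. This sketch assumes 4 KiB blocks (EXT4_ADDR_PER_BLOCK_BITS == 10) and re-implements order_base_2() as ceil(log2(n)), which is what the kernel helper yields for n > 1:

#include <stdio.h>

static int order_base_2(unsigned long n)        /* ceil(log2(n)) */
{
        int bits = 0;

        while ((1UL << bits) < n)
                bits++;
        return bits;
}

int main(void)
{
        unsigned long lblock = 5000 - 12;       /* past the direct slots */

        /* ceil(log2(4988)) == 13 -> 13 / 10 + 1 == 2 metadata blocks */
        printf("reserve %d metadata blocks\n",
               order_base_2(lblock) / 10 + 1);
        return 0;
}

Two blocks matches the worst case for a fresh allocation in the double-indirect region: one new indirect block plus the new double-indirect block on that path.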
1100 | |||
1101 | /* | ||
1102 | * Calculate the number of metadata blocks needed to reserve |||
1103 | * to allocate a block located at @lblock | 250 | * to allocate a block located at @lblock |
1104 | */ | 251 | */ |
1105 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) | 252 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) |
@@ -1107,7 +254,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) | |||
1107 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 254 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
1108 | return ext4_ext_calc_metadata_amount(inode, lblock); | 255 | return ext4_ext_calc_metadata_amount(inode, lblock); |
1109 | 256 | ||
1110 | return ext4_indirect_calc_metadata_amount(inode, lblock); | 257 | return ext4_ind_calc_metadata_amount(inode, lblock); |
1111 | } | 258 | } |
1112 | 259 | ||
1113 | /* | 260 | /* |
@@ -1500,7 +647,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, | |||
1500 | return bh; | 647 | return bh; |
1501 | if (buffer_uptodate(bh)) | 648 | if (buffer_uptodate(bh)) |
1502 | return bh; | 649 | return bh; |
1503 | ll_rw_block(READ_META, 1, &bh); | 650 | ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); |
1504 | wait_on_buffer(bh); | 651 | wait_on_buffer(bh); |
1505 | if (buffer_uptodate(bh)) | 652 | if (buffer_uptodate(bh)) |
1506 | return bh; | 653 | return bh; |
@@ -1589,16 +736,6 @@ static int do_journal_get_write_access(handle_t *handle, | |||
1589 | return ret; | 736 | return ret; |
1590 | } | 737 | } |
1591 | 738 | ||
1592 | /* | ||
1593 | * Truncate blocks that were not used by the write. We have to truncate the |||
1594 | * pagecache as well so that corresponding buffers get properly unmapped. | ||
1595 | */ | ||
1596 | static void ext4_truncate_failed_write(struct inode *inode) | ||
1597 | { | ||
1598 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
1599 | ext4_truncate(inode); | ||
1600 | } | ||
1601 | |||
1602 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, | 739 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, |
1603 | struct buffer_head *bh_result, int create); | 740 | struct buffer_head *bh_result, int create); |
1604 | static int ext4_write_begin(struct file *file, struct address_space *mapping, | 741 | static int ext4_write_begin(struct file *file, struct address_space *mapping, |
@@ -1849,6 +986,8 @@ static int ext4_journalled_write_end(struct file *file, | |||
1849 | from = pos & (PAGE_CACHE_SIZE - 1); | 986 | from = pos & (PAGE_CACHE_SIZE - 1); |
1850 | to = from + len; | 987 | to = from + len; |
1851 | 988 | ||
989 | BUG_ON(!ext4_handle_valid(handle)); | ||
990 | |||
1852 | if (copied < len) { | 991 | if (copied < len) { |
1853 | if (!PageUptodate(page)) | 992 | if (!PageUptodate(page)) |
1854 | copied = 0; | 993 | copied = 0; |
@@ -1863,6 +1002,7 @@ static int ext4_journalled_write_end(struct file *file, | |||
1863 | if (new_i_size > inode->i_size) | 1002 | if (new_i_size > inode->i_size) |
1864 | i_size_write(inode, pos+copied); | 1003 | i_size_write(inode, pos+copied); |
1865 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 1004 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
1005 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | ||
1866 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 1006 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
1867 | ext4_update_i_disksize(inode, new_i_size); | 1007 | ext4_update_i_disksize(inode, new_i_size); |
1868 | ret2 = ext4_mark_inode_dirty(handle, inode); | 1008 | ret2 = ext4_mark_inode_dirty(handle, inode); |
@@ -2121,8 +1261,11 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
2121 | clear_buffer_unwritten(bh); | 1261 | clear_buffer_unwritten(bh); |
2122 | } | 1262 | } |
2123 | 1263 | ||
2124 | /* skip page if block allocation undone */ | 1264 | /* |
2125 | if (buffer_delay(bh) || buffer_unwritten(bh)) | 1265 | * skip page if block allocation undone and |
1266 | * block is dirty | ||
1267 | */ | ||
1268 | if (ext4_bh_delay_or_unwritten(NULL, bh)) | ||
2126 | skip_page = 1; | 1269 | skip_page = 1; |
2127 | bh = bh->b_this_page; | 1270 | bh = bh->b_this_page; |
2128 | block_start += bh->b_size; | 1271 | block_start += bh->b_size; |
@@ -2148,7 +1291,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
2148 | else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) | 1291 | else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) |
2149 | err = ext4_bio_write_page(&io_submit, page, | 1292 | err = ext4_bio_write_page(&io_submit, page, |
2150 | len, mpd->wbc); | 1293 | len, mpd->wbc); |
2151 | else | 1294 | else if (buffer_uninit(page_bufs)) { |
1295 | ext4_set_bh_endio(page_bufs, inode); | ||
1296 | err = block_write_full_page_endio(page, | ||
1297 | noalloc_get_block_write, | ||
1298 | mpd->wbc, ext4_end_io_buffer_write); | ||
1299 | } else | ||
2152 | err = block_write_full_page(page, | 1300 | err = block_write_full_page(page, |
2153 | noalloc_get_block_write, mpd->wbc); | 1301 | noalloc_get_block_write, mpd->wbc); |
2154 | 1302 | ||
@@ -2564,6 +1712,8 @@ static int __ext4_journalled_writepage(struct page *page, | |||
2564 | goto out; | 1712 | goto out; |
2565 | } | 1713 | } |
2566 | 1714 | ||
1715 | BUG_ON(!ext4_handle_valid(handle)); | ||
1716 | |||
2567 | ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, | 1717 | ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, |
2568 | do_journal_get_write_access); | 1718 | do_journal_get_write_access); |
2569 | 1719 | ||
@@ -2571,6 +1721,7 @@ static int __ext4_journalled_writepage(struct page *page, | |||
2571 | write_end_fn); | 1721 | write_end_fn); |
2572 | if (ret == 0) | 1722 | if (ret == 0) |
2573 | ret = err; | 1723 | ret = err; |
1724 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | ||
2574 | err = ext4_journal_stop(handle); | 1725 | err = ext4_journal_stop(handle); |
2575 | if (!ret) | 1726 | if (!ret) |
2576 | ret = err; | 1727 | ret = err; |
@@ -2741,7 +1892,7 @@ static int write_cache_pages_da(struct address_space *mapping, | |||
2741 | index = wbc->range_start >> PAGE_CACHE_SHIFT; | 1892 | index = wbc->range_start >> PAGE_CACHE_SHIFT; |
2742 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 1893 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
2743 | 1894 | ||
2744 | if (wbc->sync_mode == WB_SYNC_ALL) | 1895 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2745 | tag = PAGECACHE_TAG_TOWRITE; | 1896 | tag = PAGECACHE_TAG_TOWRITE; |
2746 | else | 1897 | else |
2747 | tag = PAGECACHE_TAG_DIRTY; | 1898 | tag = PAGECACHE_TAG_DIRTY; |
@@ -2973,7 +2124,7 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2973 | } | 2124 | } |
2974 | 2125 | ||
2975 | retry: | 2126 | retry: |
2976 | if (wbc->sync_mode == WB_SYNC_ALL) | 2127 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
2977 | tag_pages_for_writeback(mapping, index, end); | 2128 | tag_pages_for_writeback(mapping, index, end); |
2978 | 2129 | ||
2979 | while (!ret && wbc->nr_to_write > 0) { | 2130 | while (!ret && wbc->nr_to_write > 0) { |
@@ -3219,7 +2370,7 @@ static int ext4_da_write_end(struct file *file, | |||
3219 | */ | 2370 | */ |
3220 | 2371 | ||
3221 | new_i_size = pos + copied; | 2372 | new_i_size = pos + copied; |
3222 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 2373 | if (copied && new_i_size > EXT4_I(inode)->i_disksize) { |
3223 | if (ext4_da_should_update_i_disksize(page, end)) { | 2374 | if (ext4_da_should_update_i_disksize(page, end)) { |
3224 | down_write(&EXT4_I(inode)->i_data_sem); | 2375 | down_write(&EXT4_I(inode)->i_data_sem); |
3225 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 2376 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
@@ -3450,114 +2601,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3450 | } | 2601 | } |
3451 | 2602 | ||
3452 | /* | 2603 | /* |
3453 | * O_DIRECT for ext3 (or indirect map) based files | ||
3454 | * | ||
3455 | * If the O_DIRECT write will extend the file then add this inode to the | ||
3456 | * orphan list. So recovery will truncate it back to the original size | ||
3457 | * if the machine crashes during the write. | ||
3458 | * | ||
3459 | * If the O_DIRECT write is instantiating holes inside i_size and the machine |||
3460 | * crashes then stale disk data _may_ be exposed inside the file. But current |||
3461 | * VFS code falls back to the buffered path in that case, so we are safe. |||
3462 | */ | ||
3463 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
3464 | const struct iovec *iov, loff_t offset, | ||
3465 | unsigned long nr_segs) | ||
3466 | { | ||
3467 | struct file *file = iocb->ki_filp; | ||
3468 | struct inode *inode = file->f_mapping->host; | ||
3469 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3470 | handle_t *handle; | ||
3471 | ssize_t ret; | ||
3472 | int orphan = 0; | ||
3473 | size_t count = iov_length(iov, nr_segs); | ||
3474 | int retries = 0; | ||
3475 | |||
3476 | if (rw == WRITE) { | ||
3477 | loff_t final_size = offset + count; | ||
3478 | |||
3479 | if (final_size > inode->i_size) { | ||
3480 | /* Credits for sb + inode write */ | ||
3481 | handle = ext4_journal_start(inode, 2); | ||
3482 | if (IS_ERR(handle)) { | ||
3483 | ret = PTR_ERR(handle); | ||
3484 | goto out; | ||
3485 | } | ||
3486 | ret = ext4_orphan_add(handle, inode); | ||
3487 | if (ret) { | ||
3488 | ext4_journal_stop(handle); | ||
3489 | goto out; | ||
3490 | } | ||
3491 | orphan = 1; | ||
3492 | ei->i_disksize = inode->i_size; | ||
3493 | ext4_journal_stop(handle); | ||
3494 | } | ||
3495 | } | ||
3496 | |||
3497 | retry: | ||
3498 | if (rw == READ && ext4_should_dioread_nolock(inode)) | ||
3499 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
3500 | inode->i_sb->s_bdev, iov, | ||
3501 | offset, nr_segs, | ||
3502 | ext4_get_block, NULL, NULL, 0); | ||
3503 | else { | ||
3504 | ret = blockdev_direct_IO(rw, iocb, inode, | ||
3505 | inode->i_sb->s_bdev, iov, | ||
3506 | offset, nr_segs, | ||
3507 | ext4_get_block, NULL); | ||
3508 | |||
3509 | if (unlikely((rw & WRITE) && ret < 0)) { | ||
3510 | loff_t isize = i_size_read(inode); | ||
3511 | loff_t end = offset + iov_length(iov, nr_segs); | ||
3512 | |||
3513 | if (end > isize) | ||
3514 | ext4_truncate_failed_write(inode); | ||
3515 | } | ||
3516 | } | ||
3517 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
3518 | goto retry; | ||
3519 | |||
3520 | if (orphan) { | ||
3521 | int err; | ||
3522 | |||
3523 | /* Credits for sb + inode write */ | ||
3524 | handle = ext4_journal_start(inode, 2); | ||
3525 | if (IS_ERR(handle)) { | ||
3526 | /* This is really bad luck. We've written the data | ||
3527 | * but cannot extend i_size. Bail out and pretend | ||
3528 | * the write failed... */ | ||
3529 | ret = PTR_ERR(handle); | ||
3530 | if (inode->i_nlink) | ||
3531 | ext4_orphan_del(NULL, inode); | ||
3532 | |||
3533 | goto out; | ||
3534 | } | ||
3535 | if (inode->i_nlink) | ||
3536 | ext4_orphan_del(handle, inode); | ||
3537 | if (ret > 0) { | ||
3538 | loff_t end = offset + ret; | ||
3539 | if (end > inode->i_size) { | ||
3540 | ei->i_disksize = end; | ||
3541 | i_size_write(inode, end); | ||
3542 | /* | ||
3543 | * We're going to return a positive `ret' | ||
3544 | * here due to non-zero-length I/O, so there's | ||
3545 | * no way of reporting error returns from | ||
3546 | * ext4_mark_inode_dirty() to userspace. So | ||
3547 | * ignore it. | ||
3548 | */ | ||
3549 | ext4_mark_inode_dirty(handle, inode); | ||
3550 | } | ||
3551 | } | ||
3552 | err = ext4_journal_stop(handle); | ||
3553 | if (ret == 0) | ||
3554 | ret = err; | ||
3555 | } | ||
3556 | out: | ||
3557 | return ret; | ||
3558 | } | ||
3559 | |||
3560 | /* | ||
3561 | * ext4_get_block used when preparing for a DIO write or buffer write. | 2604 | * ext4_get_block used when preparing for a DIO write or buffer write. |
3562 | * We allocate an uninitialized extent if blocks haven't been allocated. | 2605 | * We allocate an uninitialized extent if blocks haven't been allocated. |
3563 | * The extent will be converted to initialized after the IO is complete. | 2606 | * The extent will be converted to initialized after the IO is complete. |
@@ -3575,6 +2618,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3575 | ssize_t size, void *private, int ret, | 2618 | ssize_t size, void *private, int ret, |
3576 | bool is_async) | 2619 | bool is_async) |
3577 | { | 2620 | { |
2621 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | ||
3578 | ext4_io_end_t *io_end = iocb->private; | 2622 | ext4_io_end_t *io_end = iocb->private; |
3579 | struct workqueue_struct *wq; | 2623 | struct workqueue_struct *wq; |
3580 | unsigned long flags; | 2624 | unsigned long flags; |
@@ -3589,13 +2633,15 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
3589 | iocb->private, io_end->inode->i_ino, iocb, offset, | 2633 | iocb->private, io_end->inode->i_ino, iocb, offset, |
3590 | size); | 2634 | size); |
3591 | 2635 | ||
2636 | iocb->private = NULL; | ||
2637 | |||
3592 | /* if not aio dio with unwritten extents, just free io and return */ | 2638 | /* if not aio dio with unwritten extents, just free io and return */ |
3593 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | 2639 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
3594 | ext4_free_io_end(io_end); | 2640 | ext4_free_io_end(io_end); |
3595 | iocb->private = NULL; | ||
3596 | out: | 2641 | out: |
3597 | if (is_async) | 2642 | if (is_async) |
3598 | aio_complete(iocb, ret, 0); | 2643 | aio_complete(iocb, ret, 0); |
2644 | inode_dio_done(inode); | ||
3599 | return; | 2645 | return; |
3600 | } | 2646 | } |
3601 | 2647 | ||
@@ -3615,7 +2661,9 @@ out: | |||
3615 | 2661 | ||
3616 | /* queue the work to convert unwritten extents to written */ | 2662 | /* queue the work to convert unwritten extents to written */ |
3617 | queue_work(wq, &io_end->work); | 2663 | queue_work(wq, &io_end->work); |
3618 | iocb->private = NULL; | 2664 | |
2665 | /* XXX: probably should move into the real I/O completion handler */ | ||
2666 | inode_dio_done(inode); | ||
3619 | } | 2667 | } |
3620 | 2668 | ||
3621 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | 2669 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) |
@@ -3635,8 +2683,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | |||
3635 | goto out; | 2683 | goto out; |
3636 | } | 2684 | } |
3637 | 2685 | ||
3638 | io_end->flag = EXT4_IO_END_UNWRITTEN; | 2686 | /* |
2687 | * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now, | ||
2688 | * but being more careful is always safe against future changes. |||
2689 | */ | ||
3639 | inode = io_end->inode; | 2690 | inode = io_end->inode; |
2691 | if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { | ||
2692 | io_end->flag |= EXT4_IO_END_UNWRITTEN; | ||
2693 | atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); | ||
2694 | } | ||
3640 | 2695 | ||
3641 | /* Add the io_end to per-inode completed io list*/ | 2696 | /* Add the io_end to per-inode completed io list*/ |
3642 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | 2697 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); |
@@ -3748,11 +2803,13 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, | |||
3748 | EXT4_I(inode)->cur_aio_dio = iocb->private; | 2803 | EXT4_I(inode)->cur_aio_dio = iocb->private; |
3749 | } | 2804 | } |
3750 | 2805 | ||
3751 | ret = blockdev_direct_IO(rw, iocb, inode, | 2806 | ret = __blockdev_direct_IO(rw, iocb, inode, |
3752 | inode->i_sb->s_bdev, iov, | 2807 | inode->i_sb->s_bdev, iov, |
3753 | offset, nr_segs, | 2808 | offset, nr_segs, |
3754 | ext4_get_block_write, | 2809 | ext4_get_block_write, |
3755 | ext4_end_io_dio); | 2810 | ext4_end_io_dio, |
2811 | NULL, | ||
2812 | DIO_LOCKING | DIO_SKIP_HOLES); | ||
3756 | if (iocb->private) | 2813 | if (iocb->private) |
3757 | EXT4_I(inode)->cur_aio_dio = NULL; | 2814 | EXT4_I(inode)->cur_aio_dio = NULL; |
3758 | /* | 2815 | /* |
@@ -4028,383 +3085,6 @@ unlock: | |||
4028 | return err; | 3085 | return err; |
4029 | } | 3086 | } |
4030 | 3087 | ||
4031 | /* | ||
4032 | * Probably it should be a library function... search for first non-zero word | ||
4033 | * or memcmp with zero_page, whatever is better for particular architecture. | ||
4034 | * Linus? | ||
4035 | */ | ||
4036 | static inline int all_zeroes(__le32 *p, __le32 *q) | ||
4037 | { | ||
4038 | while (p < q) | ||
4039 | if (*p++) | ||
4040 | return 0; | ||
4041 | return 1; | ||
4042 | } | ||
4043 | |||
4044 | /** | ||
4045 | * ext4_find_shared - find the indirect blocks for partial truncation. | ||
4046 | * @inode: inode in question | ||
4047 | * @depth: depth of the affected branch | ||
4048 | * @offsets: offsets of pointers in that branch (see ext4_block_to_path) | ||
4049 | * @chain: place to store the pointers to partial indirect blocks | ||
4050 | * @top: place to the (detached) top of branch | ||
4051 | * | ||
4052 | * This is a helper function used by ext4_truncate(). | ||
4053 | * | ||
4054 | * When we do truncate() we may have to clean the ends of several | ||
4055 | * indirect blocks but leave the blocks themselves alive. Block is | ||
4056 | * partially truncated if some data below the new i_size is referenced |||
4057 | * from it (and it is on the path to the first completely truncated | ||
4058 | * data block, indeed). We have to free the top of that path along | ||
4059 | * with everything to the right of the path. Since no allocation | ||
4060 | * past the truncation point is possible until ext4_truncate() | ||
4061 | * finishes, we may safely do the latter, but top of branch may | ||
4062 | * require special attention - pageout below the truncation point | ||
4063 | * might try to populate it. | ||
4064 | * | ||
4065 | * We atomically detach the top of branch from the tree, store the | ||
4066 | * block number of its root in *@top, pointers to buffer_heads of | ||
4067 | * partially truncated blocks - in @chain[].bh and pointers to | ||
4068 | * their last elements that should not be removed - in | ||
4069 | * @chain[].p. Return value is the pointer to last filled element | ||
4070 | * of @chain. | ||
4071 | * | ||
4072 | * The work left to caller to do the actual freeing of subtrees: | ||
4073 | * a) free the subtree starting from *@top | ||
4074 | * b) free the subtrees whose roots are stored in | ||
4075 | * (@chain[i].p+1 .. end of @chain[i].bh->b_data) | ||
4076 | * c) free the subtrees growing from the inode past the @chain[0]. | ||
4077 | * (no partially truncated stuff there). */ | ||
4078 | |||
4079 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | ||
4080 | ext4_lblk_t offsets[4], Indirect chain[4], | ||
4081 | __le32 *top) | ||
4082 | { | ||
4083 | Indirect *partial, *p; | ||
4084 | int k, err; | ||
4085 | |||
4086 | *top = 0; | ||
4087 | /* Make k index the deepest non-null offset + 1 */ | ||
4088 | for (k = depth; k > 1 && !offsets[k-1]; k--) | ||
4089 | ; | ||
4090 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | ||
4091 | /* Writer: pointers */ | ||
4092 | if (!partial) | ||
4093 | partial = chain + k-1; | ||
4094 | /* | ||
4095 | * If the branch acquired continuation since we've looked at it - | ||
4096 | * fine, it should all survive and (new) top doesn't belong to us. | ||
4097 | */ | ||
4098 | if (!partial->key && *partial->p) | ||
4099 | /* Writer: end */ | ||
4100 | goto no_top; | ||
4101 | for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) | ||
4102 | ; | ||
4103 | /* | ||
4104 | * OK, we've found the last block that must survive. The rest of our | ||
4105 | * branch should be detached before unlocking. However, if that rest | ||
4106 | * of branch is all ours and does not grow immediately from the inode | ||
4107 | * it's easier to cheat and just decrement partial->p. | ||
4108 | */ | ||
4109 | if (p == chain + k - 1 && p > chain) { | ||
4110 | p->p--; | ||
4111 | } else { | ||
4112 | *top = *p->p; | ||
4113 | /* Nope, don't do this in ext4. Must leave the tree intact */ | ||
4114 | #if 0 | ||
4115 | *p->p = 0; | ||
4116 | #endif | ||
4117 | } | ||
4118 | /* Writer: end */ | ||
4119 | |||
4120 | while (partial > p) { | ||
4121 | brelse(partial->bh); | ||
4122 | partial--; | ||
4123 | } | ||
4124 | no_top: | ||
4125 | return partial; | ||
4126 | } | ||
4127 | |||
4128 | /* | ||
4129 | * Zero a number of block pointers in either an inode or an indirect block. | ||
4130 | * If we restart the transaction we must again get write access to the | ||
4131 | * indirect block for further modification. | ||
4132 | * | ||
4133 | * We release `count' blocks on disk, but (last - first) may be greater | ||
4134 | * than `count' because there can be holes in there. | ||
4135 | * | ||
4136 | * Return 0 on success, 1 on invalid block range | ||
4137 | * and < 0 on fatal error. | ||
4138 | */ | ||
4139 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
4140 | struct buffer_head *bh, | ||
4141 | ext4_fsblk_t block_to_free, | ||
4142 | unsigned long count, __le32 *first, | ||
4143 | __le32 *last) | ||
4144 | { | ||
4145 | __le32 *p; | ||
4146 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
4147 | int err; | ||
4148 | |||
4149 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
4150 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
4151 | |||
4152 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
4153 | count)) { | ||
4154 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
4155 | "blocks %llu len %lu", | ||
4156 | (unsigned long long) block_to_free, count); | ||
4157 | return 1; | ||
4158 | } | ||
4159 | |||
4160 | if (try_to_extend_transaction(handle, inode)) { | ||
4161 | if (bh) { | ||
4162 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
4163 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
4164 | if (unlikely(err)) | ||
4165 | goto out_err; | ||
4166 | } | ||
4167 | err = ext4_mark_inode_dirty(handle, inode); | ||
4168 | if (unlikely(err)) | ||
4169 | goto out_err; | ||
4170 | err = ext4_truncate_restart_trans(handle, inode, | ||
4171 | blocks_for_truncate(inode)); | ||
4172 | if (unlikely(err)) | ||
4173 | goto out_err; | ||
4174 | if (bh) { | ||
4175 | BUFFER_TRACE(bh, "retaking write access"); | ||
4176 | err = ext4_journal_get_write_access(handle, bh); | ||
4177 | if (unlikely(err)) | ||
4178 | goto out_err; | ||
4179 | } | ||
4180 | } | ||
4181 | |||
4182 | for (p = first; p < last; p++) | ||
4183 | *p = 0; | ||
4184 | |||
4185 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); | ||
4186 | return 0; | ||
4187 | out_err: | ||
4188 | ext4_std_error(inode->i_sb, err); | ||
4189 | return err; | ||
4190 | } | ||
4191 | |||
4192 | /** | ||
4193 | * ext4_free_data - free a list of data blocks | ||
4194 | * @handle: handle for this transaction | ||
4195 | * @inode: inode we are dealing with | ||
4196 | * @this_bh: indirect buffer_head which contains *@first and *@last | ||
4197 | * @first: array of block numbers | ||
4198 | * @last: points immediately past the end of array | ||
4199 | * | ||
4200 | * We are freeing all blocks referred from that array (numbers are stored as | ||
4201 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | ||
4202 | * | ||
4203 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | ||
4204 | * blocks are contiguous then releasing them at one time will only affect one | ||
4205 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | ||
4206 | * actually use a lot of journal space. | ||
4207 | * | ||
4208 | * @this_bh will be %NULL if @first and @last point into the inode's direct | ||
4209 | * block pointers. | ||
4210 | */ | ||
4211 | static void ext4_free_data(handle_t *handle, struct inode *inode, | ||
4212 | struct buffer_head *this_bh, | ||
4213 | __le32 *first, __le32 *last) | ||
4214 | { | ||
4215 | ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ | ||
4216 | unsigned long count = 0; /* Number of blocks in the run */ | ||
4217 | __le32 *block_to_free_p = NULL; /* Pointer into inode/ind | ||
4218 | corresponding to | ||
4219 | block_to_free */ | ||
4220 | ext4_fsblk_t nr; /* Current block # */ | ||
4221 | __le32 *p; /* Pointer into inode/ind | ||
4222 | for current block */ | ||
4223 | int err = 0; | ||
4224 | |||
4225 | if (this_bh) { /* For indirect block */ | ||
4226 | BUFFER_TRACE(this_bh, "get_write_access"); | ||
4227 | err = ext4_journal_get_write_access(handle, this_bh); | ||
4228 | /* Important: if we can't update the indirect pointers | ||
4229 | * to the blocks, we can't free them. */ | ||
4230 | if (err) | ||
4231 | return; | ||
4232 | } | ||
4233 | |||
4234 | for (p = first; p < last; p++) { | ||
4235 | nr = le32_to_cpu(*p); | ||
4236 | if (nr) { | ||
4237 | /* accumulate blocks to free if they're contiguous */ | ||
4238 | if (count == 0) { | ||
4239 | block_to_free = nr; | ||
4240 | block_to_free_p = p; | ||
4241 | count = 1; | ||
4242 | } else if (nr == block_to_free + count) { | ||
4243 | count++; | ||
4244 | } else { | ||
4245 | err = ext4_clear_blocks(handle, inode, this_bh, | ||
4246 | block_to_free, count, | ||
4247 | block_to_free_p, p); | ||
4248 | if (err) | ||
4249 | break; | ||
4250 | block_to_free = nr; | ||
4251 | block_to_free_p = p; | ||
4252 | count = 1; | ||
4253 | } | ||
4254 | } | ||
4255 | } | ||
4256 | |||
4257 | if (!err && count > 0) | ||
4258 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, | ||
4259 | count, block_to_free_p, p); | ||
4260 | if (err < 0) | ||
4261 | /* fatal error */ | ||
4262 | return; | ||
4263 | |||
4264 | if (this_bh) { | ||
4265 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | ||
4266 | |||
4267 | /* | ||
4268 | * The buffer head should have an attached journal head at this | ||
4269 | * point. However, if the data is corrupted and an indirect | ||
4270 | * block pointed to itself, it would have been detached when | ||
4271 | * the block was cleared. Check for this instead of OOPSing. | ||
4272 | */ | ||
4273 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | ||
4274 | ext4_handle_dirty_metadata(handle, inode, this_bh); | ||
4275 | else | ||
4276 | EXT4_ERROR_INODE(inode, | ||
4277 | "circular indirect block detected at " | ||
4278 | "block %llu", | ||
4279 | (unsigned long long) this_bh->b_blocknr); | ||
4280 | } | ||
4281 | } | ||
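
The loop above is a run-coalescing scan: a zero entry is a hole and is skipped, a block number adjacent to the current run extends it, and anything else flushes the run before starting a new one. A self-contained sketch of the same control flow, with printf standing in for ext4_clear_blocks() and all names illustrative:

    #include <stdint.h>
    #include <stdio.h>

    static void free_run(uint32_t start, unsigned long count)
    {
        printf("free %lu block(s) starting at %u\n", count, start);
    }

    static void free_data(const uint32_t *first, const uint32_t *last)
    {
        uint32_t block_to_free = 0;
        unsigned long count = 0;

        for (const uint32_t *p = first; p < last; p++) {
            uint32_t nr = *p;

            if (!nr)
                continue;                   /* a hole */
            if (count && nr == block_to_free + count) {
                count++;                    /* extends the current run */
            } else {
                if (count)
                    free_run(block_to_free, count);
                block_to_free = nr;
                count = 1;
            }
        }
        if (count)
            free_run(block_to_free, count); /* the trailing run */
    }

    int main(void)
    {
        uint32_t blocks[] = { 10, 11, 12, 0, 13, 40, 41 };

        free_data(blocks, blocks + 7);      /* two runs: 10..13 and 40..41 */
        return 0;
    }

A hole does not break a run: contiguity is judged purely on the physical block numbers, so the 13 above still extends the 10..12 run.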

4282 | |||
4283 | /** | ||
4284 | * ext4_free_branches - free an array of branches | ||
4285 | * @handle: JBD handle for this transaction | ||
4286 | * @inode: inode we are dealing with | ||
4287 | * @parent_bh: the buffer_head which contains *@first and *@last | ||
4288 | * @first: array of block numbers | ||
4289 | * @last: pointer immediately past the end of array | ||
4290 | * @depth: depth of the branches to free | ||
4291 | * | ||
4292 | * We are freeing all blocks referred from these branches (numbers are | ||
4293 | * stored as little-endian 32-bit) and updating @inode->i_blocks | ||
4294 | * appropriately. | ||
4295 | */ | ||
4296 | static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
4297 | struct buffer_head *parent_bh, | ||
4298 | __le32 *first, __le32 *last, int depth) | ||
4299 | { | ||
4300 | ext4_fsblk_t nr; | ||
4301 | __le32 *p; | ||
4302 | |||
4303 | if (ext4_handle_is_aborted(handle)) | ||
4304 | return; | ||
4305 | |||
4306 | if (depth--) { | ||
4307 | struct buffer_head *bh; | ||
4308 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4309 | p = last; | ||
4310 | while (--p >= first) { | ||
4311 | nr = le32_to_cpu(*p); | ||
4312 | if (!nr) | ||
4313 | continue; /* A hole */ | ||
4314 | |||
4315 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
4316 | nr, 1)) { | ||
4317 | EXT4_ERROR_INODE(inode, | ||
4318 | "invalid indirect mapped " | ||
4319 | "block %lu (level %d)", | ||
4320 | (unsigned long) nr, depth); | ||
4321 | break; | ||
4322 | } | ||
4323 | |||
4324 | /* Go read the buffer for the next level down */ | ||
4325 | bh = sb_bread(inode->i_sb, nr); | ||
4326 | |||
4327 | /* | ||
4328 | * A read failure? Report error and clear slot | ||
4329 | * (should be rare). | ||
4330 | */ | ||
4331 | if (!bh) { | ||
4332 | EXT4_ERROR_INODE_BLOCK(inode, nr, | ||
4333 | "Read failure"); | ||
4334 | continue; | ||
4335 | } | ||
4336 | |||
4337 | /* This zaps the entire block. Bottom up. */ | ||
4338 | BUFFER_TRACE(bh, "free child branches"); | ||
4339 | ext4_free_branches(handle, inode, bh, | ||
4340 | (__le32 *) bh->b_data, | ||
4341 | (__le32 *) bh->b_data + addr_per_block, | ||
4342 | depth); | ||
4343 | brelse(bh); | ||
4344 | |||
4345 | /* | ||
4346 | * Everything below this pointer has been | ||
4347 | * released. Now let this top-of-subtree go. | ||
4348 | * | ||
4349 | * We want the freeing of this indirect block to be | ||
4350 | * atomic in the journal with the updating of the | ||
4351 | * bitmap block which owns it. So make some room in | ||
4352 | * the journal. | ||
4353 | * | ||
4354 | * We zero the parent pointer *after* freeing its | ||
4355 | * pointee in the bitmaps, so if extend_transaction() | ||
4356 | * for some reason fails to put the bitmap changes and | ||
4357 | * the release into the same transaction, recovery | ||
4358 | * will merely complain about releasing a free block, | ||
4359 | * rather than leaking blocks. | ||
4360 | */ | ||
4361 | if (ext4_handle_is_aborted(handle)) | ||
4362 | return; | ||
4363 | if (try_to_extend_transaction(handle, inode)) { | ||
4364 | ext4_mark_inode_dirty(handle, inode); | ||
4365 | ext4_truncate_restart_trans(handle, inode, | ||
4366 | blocks_for_truncate(inode)); | ||
4367 | } | ||
4368 | |||
4369 | /* | ||
4370 | * The forget flag here is critical because if | ||
4371 | * we are journaling (and not doing data | ||
4372 | * journaling), we have to make sure a revoke | ||
4373 | * record is written to prevent the journal | ||
4374 | * replay from overwriting the (former) | ||
4375 | * indirect block if it gets reallocated as a | ||
4376 | * data block. This must happen in the same | ||
4377 | * transaction where the data blocks are | ||
4378 | * actually freed. | ||
4379 | */ | ||
4380 | ext4_free_blocks(handle, inode, NULL, nr, 1, | ||
4381 | EXT4_FREE_BLOCKS_METADATA| | ||
4382 | EXT4_FREE_BLOCKS_FORGET); | ||
4383 | |||
4384 | if (parent_bh) { | ||
4385 | /* | ||
4386 | * The block which we have just freed is | ||
4387 | * pointed to by an indirect block: journal it | ||
4388 | */ | ||
4389 | BUFFER_TRACE(parent_bh, "get_write_access"); | ||
4390 | if (!ext4_journal_get_write_access(handle, | ||
4391 | parent_bh)){ | ||
4392 | *p = 0; | ||
4393 | BUFFER_TRACE(parent_bh, | ||
4394 | "call ext4_handle_dirty_metadata"); | ||
4395 | ext4_handle_dirty_metadata(handle, | ||
4396 | inode, | ||
4397 | parent_bh); | ||
4398 | } | ||
4399 | } | ||
4400 | } | ||
4401 | } else { | ||
4402 | /* We have reached the bottom of the tree. */ | ||
4403 | BUFFER_TRACE(parent_bh, "free data blocks"); | ||
4404 | ext4_free_data(handle, inode, parent_bh, first, last); | ||
4405 | } | ||
4406 | } | ||
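
ext4_free_branches() is a post-order, right-to-left walk: each child subtree is released before its own indirect block, and the parent slot is zeroed only after its pointee has been freed, which is what makes a crash at any point recoverable, as the comments above explain. The shape of the recursion, modelled in userspace over a toy in-memory "disk" (sizes and names illustrative, no journaling):

    #include <stdint.h>
    #include <stdio.h>

    #define ADDR_PER_BLOCK 4                /* ext4 uses blocksize / 4 */

    struct blk { uint32_t ptr[ADDR_PER_BLOCK]; };
    static struct blk disk[16];             /* block number indexes this */

    static void free_branches(uint32_t *first, uint32_t *last, int depth)
    {
        for (uint32_t *p = last; p > first; ) {
            uint32_t nr = *--p;             /* right-to-left, as in the kernel */

            if (!nr)
                continue;                   /* a hole */
            if (depth)                      /* recurse before freeing parent */
                free_branches(disk[nr].ptr,
                              disk[nr].ptr + ADDR_PER_BLOCK, depth - 1);
            printf("free block %u (depth %d)\n", nr, depth);
            *p = 0;                         /* clear the slot after the free */
        }
    }

    int main(void)
    {
        uint32_t root[1] = { 1 };           /* one indirect block, #1 */

        disk[1] = (struct blk){ { 5, 6, 0, 0 } };
        free_branches(root, root + 1, 1);   /* frees 6, 5, then 1 */
        return 0;
    }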
4407 | |||
4408 | int ext4_can_truncate(struct inode *inode) | 3088 | int ext4_can_truncate(struct inode *inode) |
4409 | { | 3089 | { |
4410 | if (S_ISREG(inode->i_mode)) | 3090 | if (S_ISREG(inode->i_mode)) |
@@ -4471,19 +3151,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4471 | */ | 3151 | */ |
4472 | void ext4_truncate(struct inode *inode) | 3152 | void ext4_truncate(struct inode *inode) |
4473 | { | 3153 | { |
4474 | handle_t *handle; | ||
4475 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
4476 | __le32 *i_data = ei->i_data; | ||
4477 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4478 | struct address_space *mapping = inode->i_mapping; | ||
4479 | ext4_lblk_t offsets[4]; | ||
4480 | Indirect chain[4]; | ||
4481 | Indirect *partial; | ||
4482 | __le32 nr = 0; | ||
4483 | int n = 0; | ||
4484 | ext4_lblk_t last_block, max_block; | ||
4485 | unsigned blocksize = inode->i_sb->s_blocksize; | ||
4486 | |||
4487 | trace_ext4_truncate_enter(inode); | 3154 | trace_ext4_truncate_enter(inode); |
4488 | 3155 | ||
4489 | if (!ext4_can_truncate(inode)) | 3156 | if (!ext4_can_truncate(inode)) |
@@ -4494,149 +3161,11 @@ void ext4_truncate(struct inode *inode) | |||
4494 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | 3161 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
4495 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); | 3162 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); |
4496 | 3163 | ||
4497 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 3164 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
4498 | ext4_ext_truncate(inode); | 3165 | ext4_ext_truncate(inode); |
4499 | trace_ext4_truncate_exit(inode); | 3166 | else |
4500 | return; | 3167 | ext4_ind_truncate(inode); |
4501 | } | ||
4502 | |||
4503 | handle = start_transaction(inode); | ||
4504 | if (IS_ERR(handle)) | ||
4505 | return; /* AKPM: return what? */ | ||
4506 | |||
4507 | last_block = (inode->i_size + blocksize-1) | ||
4508 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4509 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
4510 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4511 | |||
4512 | if (inode->i_size & (blocksize - 1)) | ||
4513 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | ||
4514 | goto out_stop; | ||
4515 | |||
4516 | if (last_block != max_block) { | ||
4517 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | ||
4518 | if (n == 0) | ||
4519 | goto out_stop; /* error */ | ||
4520 | } | ||
4521 | |||
4522 | /* | ||
4523 | * OK. This truncate is going to happen. We add the inode to the | ||
4524 | * orphan list, so that if this truncate spans multiple transactions, | ||
4525 | * and we crash, we will resume the truncate when the filesystem | ||
4526 | * recovers. It also marks the inode dirty, to catch the new size. | ||
4527 | * | ||
4528 | * Implication: the file must always be in a sane, consistent | ||
4529 | * truncatable state while each transaction commits. | ||
4530 | */ | ||
4531 | if (ext4_orphan_add(handle, inode)) | ||
4532 | goto out_stop; | ||
4533 | |||
4534 | /* | ||
4535 | * From here we block out all ext4_get_block() callers who want to | ||
4536 | * modify the block allocation tree. | ||
4537 | */ | ||
4538 | down_write(&ei->i_data_sem); | ||
4539 | |||
4540 | ext4_discard_preallocations(inode); | ||
4541 | |||
4542 | /* | ||
4543 | * The orphan list entry will now protect us from any crash which | ||
4544 | * occurs before the truncate completes, so it is now safe to propagate | ||
4545 | * the new, shorter inode size (held for now in i_size) into the | ||
4546 | * on-disk inode. We do this via i_disksize, which is the value which | ||
4547 | * ext4 *really* writes onto the disk inode. | ||
4548 | */ | ||
4549 | ei->i_disksize = inode->i_size; | ||
4550 | |||
4551 | if (last_block == max_block) { | ||
4552 | /* | ||
4553 | * It is unnecessary to free any data blocks if last_block is | ||
4554 | * equal to the indirect block limit. | ||
4555 | */ | ||
4556 | goto out_unlock; | ||
4557 | } else if (n == 1) { /* direct blocks */ | ||
4558 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | ||
4559 | i_data + EXT4_NDIR_BLOCKS); | ||
4560 | goto do_indirects; | ||
4561 | } | ||
4562 | |||
4563 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | ||
4564 | /* Kill the top of shared branch (not detached) */ | ||
4565 | if (nr) { | ||
4566 | if (partial == chain) { | ||
4567 | /* Shared branch grows from the inode */ | ||
4568 | ext4_free_branches(handle, inode, NULL, | ||
4569 | &nr, &nr+1, (chain+n-1) - partial); | ||
4570 | *partial->p = 0; | ||
4571 | /* | ||
4572 | * We mark the inode dirty prior to restart, | ||
4573 | * and prior to stop. No need for it here. | ||
4574 | */ | ||
4575 | } else { | ||
4576 | /* Shared branch grows from an indirect block */ | ||
4577 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
4578 | ext4_free_branches(handle, inode, partial->bh, | ||
4579 | partial->p, | ||
4580 | partial->p+1, (chain+n-1) - partial); | ||
4581 | } | ||
4582 | } | ||
4583 | /* Clear the ends of indirect blocks on the shared branch */ | ||
4584 | while (partial > chain) { | ||
4585 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | ||
4586 | (__le32*)partial->bh->b_data+addr_per_block, | ||
4587 | (chain+n-1) - partial); | ||
4588 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
4589 | brelse(partial->bh); | ||
4590 | partial--; | ||
4591 | } | ||
4592 | do_indirects: | ||
4593 | /* Kill the remaining (whole) subtrees */ | ||
4594 | switch (offsets[0]) { | ||
4595 | default: | ||
4596 | nr = i_data[EXT4_IND_BLOCK]; | ||
4597 | if (nr) { | ||
4598 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | ||
4599 | i_data[EXT4_IND_BLOCK] = 0; | ||
4600 | } | ||
4601 | case EXT4_IND_BLOCK: | ||
4602 | nr = i_data[EXT4_DIND_BLOCK]; | ||
4603 | if (nr) { | ||
4604 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | ||
4605 | i_data[EXT4_DIND_BLOCK] = 0; | ||
4606 | } | ||
4607 | case EXT4_DIND_BLOCK: | ||
4608 | nr = i_data[EXT4_TIND_BLOCK]; | ||
4609 | if (nr) { | ||
4610 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | ||
4611 | i_data[EXT4_TIND_BLOCK] = 0; | ||
4612 | } | ||
4613 | case EXT4_TIND_BLOCK: | ||
4614 | ; | ||
4615 | } | ||
4616 | |||
4617 | out_unlock: | ||
4618 | up_write(&ei->i_data_sem); | ||
4619 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4620 | ext4_mark_inode_dirty(handle, inode); | ||
4621 | |||
4622 | /* | ||
4623 | * In a multi-transaction truncate, we only make the final transaction | ||
4624 | * synchronous | ||
4625 | */ | ||
4626 | if (IS_SYNC(inode)) | ||
4627 | ext4_handle_sync(handle); | ||
4628 | out_stop: | ||
4629 | /* | ||
4630 | * If this was a simple ftruncate(), and the file will remain alive | ||
4631 | * then we need to clear up the orphan record which we created above. | ||
4632 | * However, if this was a real unlink then we were called by | ||
4633 | * ext4_delete_inode(), and we allow that function to clean up the | ||
4634 | * orphan info for us. | ||
4635 | */ | ||
4636 | if (inode->i_nlink) | ||
4637 | ext4_orphan_del(handle, inode); | ||
4638 | 3168 | ||
4639 | ext4_journal_stop(handle); | ||
4640 | trace_ext4_truncate_exit(inode); | 3169 | trace_ext4_truncate_exit(inode); |
4641 | } | 3170 | } |
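
With the indirect-block code moved out to the new indirect.c, ext4_truncate() above shrinks to a dispatcher on the EXTENTS inode flag. Its post-patch control flow is essentially the following (a userspace illustration; tracepoints and the can-truncate check elided):

    #include <stdbool.h>
    #include <stdio.h>

    struct inode { bool extent_mapped; };

    static void ext_truncate(struct inode *inode) { (void)inode; puts("extent truncate"); }
    static void ind_truncate(struct inode *inode) { (void)inode; puts("indirect truncate"); }

    /* One call per mapping flavour; all block-map details now live
     * behind ext4_ext_truncate() / ext4_ind_truncate(). */
    static void do_truncate(struct inode *inode)
    {
        if (inode->extent_mapped)
            ext_truncate(inode);
        else
            ind_truncate(inode);
    }

    int main(void)
    {
        struct inode a = { .extent_mapped = true };
        struct inode b = { .extent_mapped = false };

        do_truncate(&a);
        do_truncate(&b);
        return 0;
    }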
4642 | 3171 | ||
@@ -4772,7 +3301,7 @@ make_io: | |||
4772 | trace_ext4_load_inode(inode); | 3301 | trace_ext4_load_inode(inode); |
4773 | get_bh(bh); | 3302 | get_bh(bh); |
4774 | bh->b_end_io = end_buffer_read_sync; | 3303 | bh->b_end_io = end_buffer_read_sync; |
4775 | submit_bh(READ_META, bh); | 3304 | submit_bh(READ | REQ_META | REQ_PRIO, bh); |
4776 | wait_on_buffer(bh); | 3305 | wait_on_buffer(bh); |
4777 | if (!buffer_uptodate(bh)) { | 3306 | if (!buffer_uptodate(bh)) { |
4778 | EXT4_ERROR_INODE_BLOCK(inode, block, | 3307 | EXT4_ERROR_INODE_BLOCK(inode, block, |
@@ -5007,7 +3536,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
5007 | (S_ISLNK(inode->i_mode) && | 3536 | (S_ISLNK(inode->i_mode) && |
5008 | !ext4_inode_is_fast_symlink(inode))) { | 3537 | !ext4_inode_is_fast_symlink(inode))) { |
5009 | /* Validate block references which are part of inode */ | 3538 | /* Validate block references which are part of inode */ |
5010 | ret = ext4_check_inode_blockref(inode); | 3539 | ret = ext4_ind_check_inode(inode); |
5011 | } | 3540 | } |
5012 | if (ret) | 3541 | if (ret) |
5013 | goto bad_inode; | 3542 | goto bad_inode; |
@@ -5351,6 +3880,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
5351 | } | 3880 | } |
5352 | 3881 | ||
5353 | if (attr->ia_valid & ATTR_SIZE) { | 3882 | if (attr->ia_valid & ATTR_SIZE) { |
3883 | inode_dio_wait(inode); | ||
3884 | |||
5354 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { | 3885 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { |
5355 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | 3886 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); |
5356 | 3887 | ||
@@ -5452,34 +3983,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
5452 | return 0; | 3983 | return 0; |
5453 | } | 3984 | } |
5454 | 3985 | ||
5455 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | ||
5456 | int chunk) | ||
5457 | { | ||
5458 | int indirects; | ||
5459 | |||
5460 | /* if nrblocks are contiguous */ | ||
5461 | if (chunk) { | ||
5462 | /* | ||
5463 | * With N contiguous data blocks, we need at most | ||
5464 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
5465 | * 2 dindirect blocks, and 1 tindirect block | ||
5466 | */ | ||
5467 | return DIV_ROUND_UP(nrblocks, | ||
5468 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
5469 | } | ||
5470 | /* | ||
5471 | * if nrblocks are not contiguous, worst case, each block touches | ||
5472 | * an indirect block, and each indirect block touches a double indirect | ||
5473 | * block, plus a triple indirect block | ||
5474 | */ | ||
5475 | indirects = nrblocks * 2 + 1; | ||
5476 | return indirects; | ||
5477 | } | ||
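
The deleted helper survives in indirect.c as ext4_ind_trans_blocks(), and its arithmetic is worth a worked example. Assuming 4 KiB blocks, so an indirect block holds 1024 pointers (an assumption of this example, not part of the patch), 1000 contiguous blocks need at most ceil(1000/1024) + 4 = 5 metadata blocks, while 1000 scattered blocks must budget 2*1000 + 1:

    #include <stdio.h>

    #define ADDR_PER_BLOCK 1024             /* 4 KiB block / 4-byte pointer */

    static int ind_trans_blocks(int nrblocks, int chunk)
    {
        if (chunk)  /* contiguous: N/APB indirects + 2 dind + 1 tind + 1 */
            return (nrblocks + ADDR_PER_BLOCK - 1) / ADDR_PER_BLOCK + 4;
        return nrblocks * 2 + 1;            /* worst case when scattered */
    }

    int main(void)
    {
        printf("1000 contiguous blocks -> %d metadata blocks\n",
               ind_trans_blocks(1000, 1));
        printf("1000 scattered blocks  -> %d metadata blocks\n",
               ind_trans_blocks(1000, 0));
        return 0;
    }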
5478 | |||
5479 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 3986 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5480 | { | 3987 | { |
5481 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 3988 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
5482 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); | 3989 | return ext4_ind_trans_blocks(inode, nrblocks, chunk); |
5483 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 3990 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); |
5484 | } | 3991 | } |
5485 | 3992 | ||
@@ -5843,80 +4350,85 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
5843 | struct page *page = vmf->page; | 4350 | struct page *page = vmf->page; |
5844 | loff_t size; | 4351 | loff_t size; |
5845 | unsigned long len; | 4352 | unsigned long len; |
5846 | int ret = -EINVAL; | 4353 | int ret; |
5847 | void *fsdata; | ||
5848 | struct file *file = vma->vm_file; | 4354 | struct file *file = vma->vm_file; |
5849 | struct inode *inode = file->f_path.dentry->d_inode; | 4355 | struct inode *inode = file->f_path.dentry->d_inode; |
5850 | struct address_space *mapping = inode->i_mapping; | 4356 | struct address_space *mapping = inode->i_mapping; |
4357 | handle_t *handle; | ||
4358 | get_block_t *get_block; | ||
4359 | int retries = 0; | ||
5851 | 4360 | ||
5852 | /* | 4361 | /* |
5853 | * Get i_alloc_sem to stop truncates messing with the inode. We cannot | 4362 | * This check is racy but catches the common case. We rely on |
5854 | * get i_mutex because we are already holding mmap_sem. | 4363 | * __block_page_mkwrite() to do a reliable check. |
5855 | */ | 4364 | */ |
5856 | down_read(&inode->i_alloc_sem); | 4365 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
5857 | size = i_size_read(inode); | 4366 | /* Delalloc case is easy... */ |
5858 | if (page->mapping != mapping || size <= page_offset(page) | 4367 | if (test_opt(inode->i_sb, DELALLOC) && |
5859 | || !PageUptodate(page)) { | 4368 | !ext4_should_journal_data(inode) && |
5860 | /* page got truncated from under us? */ | 4369 | !ext4_nonda_switch(inode->i_sb)) { |
5861 | goto out_unlock; | 4370 | do { |
4371 | ret = __block_page_mkwrite(vma, vmf, | ||
4372 | ext4_da_get_block_prep); | ||
4373 | } while (ret == -ENOSPC && | ||
4374 | ext4_should_retry_alloc(inode->i_sb, &retries)); | ||
4375 | goto out_ret; | ||
5862 | } | 4376 | } |
5863 | ret = 0; | ||
5864 | 4377 | ||
5865 | lock_page(page); | 4378 | lock_page(page); |
5866 | wait_on_page_writeback(page); | 4379 | size = i_size_read(inode); |
5867 | if (PageMappedToDisk(page)) { | 4380 | /* Page got truncated from under us? */ |
5868 | up_read(&inode->i_alloc_sem); | 4381 | if (page->mapping != mapping || page_offset(page) > size) { |
5869 | return VM_FAULT_LOCKED; | 4382 | unlock_page(page); |
4383 | ret = VM_FAULT_NOPAGE; | ||
4384 | goto out; | ||
5870 | } | 4385 | } |
5871 | 4386 | ||
5872 | if (page->index == size >> PAGE_CACHE_SHIFT) | 4387 | if (page->index == size >> PAGE_CACHE_SHIFT) |
5873 | len = size & ~PAGE_CACHE_MASK; | 4388 | len = size & ~PAGE_CACHE_MASK; |
5874 | else | 4389 | else |
5875 | len = PAGE_CACHE_SIZE; | 4390 | len = PAGE_CACHE_SIZE; |
5876 | |||
5877 | /* | 4391 | /* |
5878 | * return if we have all the buffers mapped. This avoid | 4392 | * Return if we have all the buffers mapped. This avoids the need to do |
5879 | * the need to call write_begin/write_end which does a | 4393 | * journal_start/journal_stop which can block and take a long time |
5880 | * journal_start/journal_stop which can block and take | ||
5881 | * long time | ||
5882 | */ | 4394 | */ |
5883 | if (page_has_buffers(page)) { | 4395 | if (page_has_buffers(page)) { |
5884 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | 4396 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, |
5885 | ext4_bh_unmapped)) { | 4397 | ext4_bh_unmapped)) { |
5886 | up_read(&inode->i_alloc_sem); | 4398 | /* Wait so that we don't change page under IO */ |
5887 | return VM_FAULT_LOCKED; | 4399 | wait_on_page_writeback(page); |
4400 | ret = VM_FAULT_LOCKED; | ||
4401 | goto out; | ||
5888 | } | 4402 | } |
5889 | } | 4403 | } |
5890 | unlock_page(page); | 4404 | unlock_page(page); |
5891 | /* | 4405 | /* OK, we need to fill the hole... */ |
5892 | * OK, we need to fill the hole... Do write_begin write_end | 4406 | if (ext4_should_dioread_nolock(inode)) |
5893 | * to do block allocation/reservation.We are not holding | 4407 | get_block = ext4_get_block_write; |
5894 | * inode.i__mutex here. That allow * parallel write_begin, | 4408 | else |
5895 | * write_end call. lock_page prevent this from happening | 4409 | get_block = ext4_get_block; |
5896 | * on the same page though | 4410 | retry_alloc: |
5897 | */ | 4411 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); |
5898 | ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), | 4412 | if (IS_ERR(handle)) { |
5899 | len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); | ||
5900 | if (ret < 0) | ||
5901 | goto out_unlock; | ||
5902 | ret = mapping->a_ops->write_end(file, mapping, page_offset(page), | ||
5903 | len, len, page, fsdata); | ||
5904 | if (ret < 0) | ||
5905 | goto out_unlock; | ||
5906 | ret = 0; | ||
5907 | |||
5908 | /* | ||
5909 | * write_begin/end might have created a dirty page and someone | ||
5910 | * could wander in and start the IO. Make sure that hasn't | ||
5911 | * happened. | ||
5912 | */ | ||
5913 | lock_page(page); | ||
5914 | wait_on_page_writeback(page); | ||
5915 | up_read(&inode->i_alloc_sem); | ||
5916 | return VM_FAULT_LOCKED; | ||
5917 | out_unlock: | ||
5918 | if (ret) | ||
5919 | ret = VM_FAULT_SIGBUS; | 4413 | ret = VM_FAULT_SIGBUS; |
5920 | up_read(&inode->i_alloc_sem); | 4414 | goto out; |
4415 | } | ||
4416 | ret = __block_page_mkwrite(vma, vmf, get_block); | ||
4417 | if (!ret && ext4_should_journal_data(inode)) { | ||
4418 | if (walk_page_buffers(handle, page_buffers(page), 0, | ||
4419 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) { | ||
4420 | unlock_page(page); | ||
4421 | ret = VM_FAULT_SIGBUS; | ||
4422 | ext4_journal_stop(handle); | ||
4423 | goto out; | ||
4424 | } | ||
4425 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | ||
4426 | } | ||
4427 | ext4_journal_stop(handle); | ||
4428 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
4429 | goto retry_alloc; | ||
4430 | out_ret: | ||
4431 | ret = block_page_mkwrite_return(ret); | ||
4432 | out: | ||
5921 | return ret; | 4433 | return ret; |
5922 | } | 4434 | } |
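
Both new paths in ext4_page_mkwrite() share one idiom: treat -ENOSPC as possibly transient and retry once the allocator may have freed space (in the kernel, ext4_should_retry_alloc() can commit the journal to release reserved blocks). A userspace model of just that retry skeleton, every name here illustrative:

    #include <errno.h>
    #include <stdio.h>

    static int attempts;

    static int fake_mkwrite(void)       /* stands in for __block_page_mkwrite() */
    {
        return ++attempts < 3 ? -ENOSPC : 0;
    }

    static int should_retry_alloc(int *retries)
    {
        return (*retries)++ < 3;        /* bounded, like the kernel's helper */
    }

    int main(void)
    {
        int ret, retries = 0;

        do {
            ret = fake_mkwrite();
        } while (ret == -ENOSPC && should_retry_alloc(&retries));

        printf("ret=%d after %d attempt(s)\n", ret, attempts);
        return 0;
    }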
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554e773..f18bfe37aff 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -202,8 +202,9 @@ setversion_out: | |||
202 | struct super_block *sb = inode->i_sb; | 202 | struct super_block *sb = inode->i_sb; |
203 | int err, err2=0; | 203 | int err, err2=0; |
204 | 204 | ||
205 | if (!capable(CAP_SYS_RESOURCE)) | 205 | err = ext4_resize_begin(sb); |
206 | return -EPERM; | 206 | if (err) |
207 | return err; | ||
207 | 208 | ||
208 | if (get_user(n_blocks_count, (__u32 __user *)arg)) | 209 | if (get_user(n_blocks_count, (__u32 __user *)arg)) |
209 | return -EFAULT; | 210 | return -EFAULT; |
@@ -221,6 +222,7 @@ setversion_out: | |||
221 | if (err == 0) | 222 | if (err == 0) |
222 | err = err2; | 223 | err = err2; |
223 | mnt_drop_write(filp->f_path.mnt); | 224 | mnt_drop_write(filp->f_path.mnt); |
225 | ext4_resize_end(sb); | ||
224 | 226 | ||
225 | return err; | 227 | return err; |
226 | } | 228 | } |
@@ -271,8 +273,9 @@ mext_out: | |||
271 | struct super_block *sb = inode->i_sb; | 273 | struct super_block *sb = inode->i_sb; |
272 | int err, err2=0; | 274 | int err, err2=0; |
273 | 275 | ||
274 | if (!capable(CAP_SYS_RESOURCE)) | 276 | err = ext4_resize_begin(sb); |
275 | return -EPERM; | 277 | if (err) |
278 | return err; | ||
276 | 279 | ||
277 | if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, | 280 | if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, |
278 | sizeof(input))) | 281 | sizeof(input))) |
@@ -291,6 +294,7 @@ mext_out: | |||
291 | if (err == 0) | 294 | if (err == 0) |
292 | err = err2; | 295 | err = err2; |
293 | mnt_drop_write(filp->f_path.mnt); | 296 | mnt_drop_write(filp->f_path.mnt); |
297 | ext4_resize_end(sb); | ||
294 | 298 | ||
295 | return err; | 299 | return err; |
296 | } | 300 | } |
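
Both resize ioctls now bracket the operation with ext4_resize_begin()/ext4_resize_end() instead of a bare capability check; besides rechecking permissions, the pair serializes concurrent resizers. A sketch of the exclusion it buys, modelled with an atomic flag (a userspace illustration, not the kernel implementation):

    #include <errno.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag resizing = ATOMIC_FLAG_INIT;

    static int resize_begin(void)
    {
        /* the real ext4_resize_begin() also rechecks permissions */
        if (atomic_flag_test_and_set(&resizing))
            return -EBUSY;              /* another resize is running */
        return 0;
    }

    static void resize_end(void)
    {
        atomic_flag_clear(&resizing);
    }

    int main(void)
    {
        if (resize_begin() == 0) {
            printf("second begin returns %d (EBUSY expected)\n",
                   resize_begin());
            resize_end();
        }
        return 0;
    }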
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d5685..17a5a57c415 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -75,8 +75,8 @@ | |||
75 | * | 75 | * |
76 | * The inode preallocation space is used looking at the _logical_ start | 76 | * The inode preallocation space is used looking at the _logical_ start |
77 | * block. If only the logical file block falls within the range of prealloc | 77 | * block. If only the logical file block falls within the range of prealloc |
78 | * space we will consume the particular prealloc space. This make sure that | 78 | * space we will consume the particular prealloc space. This makes sure that |
79 | * that the we have contiguous physical blocks representing the file blocks | 79 | * we have contiguous physical blocks representing the file blocks |
80 | * | 80 | * |
81 | * The important thing to be noted in case of inode prealloc space is that | 81 | * The important thing to be noted in case of inode prealloc space is that |
82 | * we don't modify the values associated with inode prealloc space except | 82 | * we don't modify the values associated with inode prealloc space except
@@ -84,7 +84,7 @@ | |||
84 | * | 84 | * |
85 | * If we are not able to find blocks in the inode prealloc space and if we | 85 | * If we are not able to find blocks in the inode prealloc space and if we |
86 | * have the group allocation flag set then we look at the locality group | 86 | * have the group allocation flag set then we look at the locality group |
87 | * prealloc space. These are per CPU prealloc list repreasented as | 87 | * prealloc space. These are per CPU prealloc list represented as |
88 | * | 88 | * |
89 | * ext4_sb_info.s_locality_groups[smp_processor_id()] | 89 | * ext4_sb_info.s_locality_groups[smp_processor_id()] |
90 | * | 90 | * |
@@ -128,12 +128,13 @@ | |||
128 | * we are doing a group prealloc we try to normalize the request to | 128 | * we are doing a group prealloc we try to normalize the request to |
129 | * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is | 129 | * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is |
130 | * 512 blocks. This can be tuned via | 130 | * 512 blocks. This can be tuned via |
131 | * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in | 131 | * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in |
132 | * terms of number of blocks. If we have mounted the file system with -O | 132 | * terms of number of blocks. If we have mounted the file system with -O |
133 | * stripe=<value> option the group prealloc request is normalized to the | 133 | * stripe=<value> option the group prealloc request is normalized to the |
134 | stripe value (sbi->s_stripe) | 134 | smallest multiple of the stripe value (sbi->s_stripe) which is
135 | * greater than the default mb_group_prealloc. | ||
135 | * | 136 | * |
136 | * The regular allocator(using the buddy cache) supports few tunables. | 137 | * The regular allocator (using the buddy cache) supports a few tunables. |
137 | * | 138 | * |
138 | * /sys/fs/ext4/<partition>/mb_min_to_scan | 139 | * /sys/fs/ext4/<partition>/mb_min_to_scan |
139 | * /sys/fs/ext4/<partition>/mb_max_to_scan | 140 | * /sys/fs/ext4/<partition>/mb_max_to_scan |
@@ -152,7 +153,7 @@ | |||
152 | * best extent in the found extents. Searching for the blocks starts with | 153 | * best extent in the found extents. Searching for the blocks starts with |
153 | * the group specified as the goal value in allocation context via | 154 | * the group specified as the goal value in allocation context via |
154 | * ac_g_ex. Each group is first checked based on the criteria whether it | 155 | * ac_g_ex. Each group is first checked based on the criteria whether it |
155 | * can used for allocation. ext4_mb_good_group explains how the groups are | 156 | * can be used for allocation. ext4_mb_good_group explains how the groups are |
156 | * checked. | 157 | * checked. |
157 | * | 158 | * |
158 | * Both the prealloc spaces get populated as above. So for the first | 159 | * Both the prealloc spaces get populated as above. So for the first
@@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | |||
492 | b2 = (unsigned char *) bitmap; | 493 | b2 = (unsigned char *) bitmap; |
493 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { | 494 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { |
494 | if (b1[i] != b2[i]) { | 495 | if (b1[i] != b2[i]) { |
495 | printk(KERN_ERR "corruption in group %u " | 496 | ext4_msg(e4b->bd_sb, KERN_ERR, |
496 | "at byte %u(%u): %x in copy != %x " | 497 | "corruption in group %u " |
497 | "on disk/prealloc\n", | 498 | "at byte %u(%u): %x in copy != %x " |
498 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | 499 | "on disk/prealloc", |
500 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | ||
499 | BUG(); | 501 | BUG(); |
500 | } | 502 | } |
501 | } | 503 | } |
@@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1125 | grp = ext4_get_group_info(sb, group); | 1127 | grp = ext4_get_group_info(sb, group); |
1126 | 1128 | ||
1127 | e4b->bd_blkbits = sb->s_blocksize_bits; | 1129 | e4b->bd_blkbits = sb->s_blocksize_bits; |
1128 | e4b->bd_info = ext4_get_group_info(sb, group); | 1130 | e4b->bd_info = grp; |
1129 | e4b->bd_sb = sb; | 1131 | e4b->bd_sb = sb; |
1130 | e4b->bd_group = group; | 1132 | e4b->bd_group = group; |
1131 | e4b->bd_buddy_page = NULL; | 1133 | e4b->bd_buddy_page = NULL; |
@@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len) | |||
1281 | } | 1283 | } |
1282 | } | 1284 | } |
1283 | 1285 | ||
1284 | static void mb_set_bits(void *bm, int cur, int len) | 1286 | void ext4_set_bits(void *bm, int cur, int len) |
1285 | { | 1287 | { |
1286 | __u32 *addr; | 1288 | __u32 *addr; |
1287 | 1289 | ||
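
mb_set_bits() loses its static and is renamed ext4_set_bits() so the resize code can share it; semantically it just sets len bits starting at bit cur. A portable model (the kernel version additionally fast-paths whole 32-bit words):

    #include <stdint.h>
    #include <stdio.h>

    static void set_bits(uint32_t *bm, int cur, int len)
    {
        while (len-- > 0) {
            bm[cur >> 5] |= UINT32_C(1) << (cur & 31);
            cur++;
        }
    }

    int main(void)
    {
        uint32_t bitmap[2] = { 0, 0 };

        set_bits(bitmap, 30, 4);        /* spans a word boundary */
        printf("%08x %08x\n", bitmap[0], bitmap[1]);
        return 0;                       /* prints c0000000 00000003 */
    }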
@@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
1510 | } | 1512 | } |
1511 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); | 1513 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); |
1512 | 1514 | ||
1513 | mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | 1515 | ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); |
1514 | mb_check_buddy(e4b); | 1516 | mb_check_buddy(e4b); |
1515 | 1517 | ||
1516 | return ret; | 1518 | return ret; |
@@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2223 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2225 | EXT4_DESC_PER_BLOCK_BITS(sb); |
2224 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | 2226 | meta_group_info = kmalloc(metalen, GFP_KERNEL); |
2225 | if (meta_group_info == NULL) { | 2227 | if (meta_group_info == NULL) { |
2226 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | 2228 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " |
2227 | "buddy group\n"); | 2229 | "for a buddy group"); |
2228 | goto exit_meta_group_info; | 2230 | goto exit_meta_group_info; |
2229 | } | 2231 | } |
2230 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = | 2232 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = |
@@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2237 | 2239 | ||
2238 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); | 2240 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); |
2239 | if (meta_group_info[i] == NULL) { | 2241 | if (meta_group_info[i] == NULL) { |
2240 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | 2242 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); |
2241 | goto exit_group_info; | 2243 | goto exit_group_info; |
2242 | } | 2244 | } |
2243 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); | 2245 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); |
@@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2279 | 2281 | ||
2280 | exit_group_info: | 2282 | exit_group_info: |
2281 | /* If a meta_group_info table has been allocated, release it now */ | 2283 | /* If a meta_group_info table has been allocated, release it now */ |
2282 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) | 2284 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { |
2283 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); | 2285 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); |
2286 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; | ||
2287 | } | ||
2284 | exit_meta_group_info: | 2288 | exit_meta_group_info: |
2285 | return -ENOMEM; | 2289 | return -ENOMEM; |
2286 | } /* ext4_mb_add_groupinfo */ | 2290 | } /* ext4_mb_add_groupinfo */ |
@@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
2328 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | 2332 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte |
2329 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | 2333 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. |
2330 | * So a two level scheme suffices for now. */ | 2334 | * So a two level scheme suffices for now. */ |
2331 | sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); | 2335 | sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); |
2332 | if (sbi->s_group_info == NULL) { | 2336 | if (sbi->s_group_info == NULL) { |
2333 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | 2337 | ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); |
2334 | return -ENOMEM; | 2338 | return -ENOMEM; |
2335 | } | 2339 | } |
2336 | sbi->s_buddy_cache = new_inode(sb); | 2340 | sbi->s_buddy_cache = new_inode(sb); |
2337 | if (sbi->s_buddy_cache == NULL) { | 2341 | if (sbi->s_buddy_cache == NULL) { |
2338 | printk(KERN_ERR "EXT4-fs: can't get new inode\n"); | 2342 | ext4_msg(sb, KERN_ERR, "can't get new inode"); |
2339 | goto err_freesgi; | 2343 | goto err_freesgi; |
2340 | } | 2344 | } |
2341 | sbi->s_buddy_cache->i_ino = get_next_ino(); | 2345 | /* To avoid potentially colliding with a valid on-disk inode number,
2346 | * use EXT4_BAD_INO for the buddy cache inode number. This inode is | ||
2347 | * not in the inode hash, so it should never be found by iget(), but | ||
2348 | * this will avoid confusion if it ever shows up during debugging. */ | ||
2349 | sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; | ||
2342 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; | 2350 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; |
2343 | for (i = 0; i < ngroups; i++) { | 2351 | for (i = 0; i < ngroups; i++) { |
2344 | desc = ext4_get_group_desc(sb, i, NULL); | 2352 | desc = ext4_get_group_desc(sb, i, NULL); |
2345 | if (desc == NULL) { | 2353 | if (desc == NULL) { |
2346 | printk(KERN_ERR | 2354 | ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); |
2347 | "EXT4-fs: can't read descriptor %u\n", i); | ||
2348 | goto err_freebuddy; | 2355 | goto err_freebuddy; |
2349 | } | 2356 | } |
2350 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) | 2357 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) |
@@ -2362,7 +2369,7 @@ err_freebuddy: | |||
2362 | kfree(sbi->s_group_info[i]); | 2369 | kfree(sbi->s_group_info[i]); |
2363 | iput(sbi->s_buddy_cache); | 2370 | iput(sbi->s_buddy_cache); |
2364 | err_freesgi: | 2371 | err_freesgi: |
2365 | kfree(sbi->s_group_info); | 2372 | ext4_kvfree(sbi->s_group_info); |
2366 | return -ENOMEM; | 2373 | return -ENOMEM; |
2367 | } | 2374 | } |
2368 | 2375 | ||
@@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size) | |||
2404 | slab_size, 0, SLAB_RECLAIM_ACCOUNT, | 2411 | slab_size, 0, SLAB_RECLAIM_ACCOUNT, |
2405 | NULL); | 2412 | NULL); |
2406 | 2413 | ||
2414 | ext4_groupinfo_caches[cache_index] = cachep; | ||
2415 | |||
2407 | mutex_unlock(&ext4_grpinfo_slab_create_mutex); | 2416 | mutex_unlock(&ext4_grpinfo_slab_create_mutex); |
2408 | if (!cachep) { | 2417 | if (!cachep) { |
2409 | printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); | 2418 | printk(KERN_EMERG |
2419 | "EXT4-fs: no memory for groupinfo slab cache\n"); | ||
2410 | return -ENOMEM; | 2420 | return -ENOMEM; |
2411 | } | 2421 | } |
2412 | 2422 | ||
2413 | ext4_groupinfo_caches[cache_index] = cachep; | ||
2414 | |||
2415 | return 0; | 2423 | return 0; |
2416 | } | 2424 | } |
2417 | 2425 | ||
@@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2457 | i++; | 2465 | i++; |
2458 | } while (i <= sb->s_blocksize_bits + 1); | 2466 | } while (i <= sb->s_blocksize_bits + 1); |
2459 | 2467 | ||
2460 | /* init file for buddy data */ | ||
2461 | ret = ext4_mb_init_backend(sb); | ||
2462 | if (ret != 0) { | ||
2463 | goto out; | ||
2464 | } | ||
2465 | |||
2466 | spin_lock_init(&sbi->s_md_lock); | 2468 | spin_lock_init(&sbi->s_md_lock); |
2467 | spin_lock_init(&sbi->s_bal_lock); | 2469 | spin_lock_init(&sbi->s_bal_lock); |
2468 | 2470 | ||
@@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2472 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; | 2474 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; |
2473 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; | 2475 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; |
2474 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; | 2476 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; |
2477 | /* | ||
2478 | * If there is a s_stripe > 1, then we set the s_mb_group_prealloc | ||
2479 | * to the lowest multiple of s_stripe which is bigger than | ||
2480 | * the s_mb_group_prealloc as determined above. We want | ||
2481 | * the preallocation size to be an exact multiple of the | ||
2482 | * RAID stripe size so that preallocations don't fragment | ||
2483 | * the stripes. | ||
2484 | */ | ||
2485 | if (sbi->s_stripe > 1) { | ||
2486 | sbi->s_mb_group_prealloc = roundup( | ||
2487 | sbi->s_mb_group_prealloc, sbi->s_stripe); | ||
2488 | } | ||
2475 | 2489 | ||
2476 | sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); | 2490 | sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); |
2477 | if (sbi->s_locality_groups == NULL) { | 2491 | if (sbi->s_locality_groups == NULL) { |
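
The net effect of the hunk above: on a striped device the locality-group preallocation becomes the smallest multiple of s_stripe at or above the 512-block default, so group preallocations stop fragmenting stripes. A worked example with a hypothetical stripe of 96 blocks:

    #include <stdio.h>

    static unsigned roundup_to(unsigned x, unsigned step)
    {
        return (x + step - 1) / step * step;    /* the kernel's roundup() */
    }

    int main(void)
    {
        unsigned prealloc = 512;        /* MB_DEFAULT_GROUP_PREALLOC */
        unsigned stripe = 96;           /* example -o stripe= value */

        printf("group prealloc: %u -> %u blocks\n",
               prealloc, roundup_to(prealloc, stripe));
        return 0;                       /* 512 -> 576 = 6 * 96 */
    }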
@@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2487 | spin_lock_init(&lg->lg_prealloc_lock); | 2501 | spin_lock_init(&lg->lg_prealloc_lock); |
2488 | } | 2502 | } |
2489 | 2503 | ||
2504 | /* init file for buddy data */ | ||
2505 | ret = ext4_mb_init_backend(sb); | ||
2506 | if (ret != 0) { | ||
2507 | goto out; | ||
2508 | } | ||
2509 | |||
2490 | if (sbi->s_proc) | 2510 | if (sbi->s_proc) |
2491 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, | 2511 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, |
2492 | &ext4_mb_seq_groups_fops, sb); | 2512 | &ext4_mb_seq_groups_fops, sb); |
@@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb) | |||
2544 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2564 | EXT4_DESC_PER_BLOCK_BITS(sb); |
2545 | for (i = 0; i < num_meta_group_infos; i++) | 2565 | for (i = 0; i < num_meta_group_infos; i++) |
2546 | kfree(sbi->s_group_info[i]); | 2566 | kfree(sbi->s_group_info[i]); |
2547 | kfree(sbi->s_group_info); | 2567 | ext4_kvfree(sbi->s_group_info); |
2548 | } | 2568 | } |
2549 | kfree(sbi->s_mb_offsets); | 2569 | kfree(sbi->s_mb_offsets); |
2550 | kfree(sbi->s_mb_maxs); | 2570 | kfree(sbi->s_mb_maxs); |
2551 | if (sbi->s_buddy_cache) | 2571 | if (sbi->s_buddy_cache) |
2552 | iput(sbi->s_buddy_cache); | 2572 | iput(sbi->s_buddy_cache); |
2553 | if (sbi->s_mb_stats) { | 2573 | if (sbi->s_mb_stats) { |
2554 | printk(KERN_INFO | 2574 | ext4_msg(sb, KERN_INFO, |
2555 | "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", | 2575 | "mballoc: %u blocks %u reqs (%u success)", |
2556 | atomic_read(&sbi->s_bal_allocated), | 2576 | atomic_read(&sbi->s_bal_allocated), |
2557 | atomic_read(&sbi->s_bal_reqs), | 2577 | atomic_read(&sbi->s_bal_reqs), |
2558 | atomic_read(&sbi->s_bal_success)); | 2578 | atomic_read(&sbi->s_bal_success)); |
2559 | printk(KERN_INFO | 2579 | ext4_msg(sb, KERN_INFO, |
2560 | "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " | 2580 | "mballoc: %u extents scanned, %u goal hits, " |
2561 | "%u 2^N hits, %u breaks, %u lost\n", | 2581 | "%u 2^N hits, %u breaks, %u lost", |
2562 | atomic_read(&sbi->s_bal_ex_scanned), | 2582 | atomic_read(&sbi->s_bal_ex_scanned), |
2563 | atomic_read(&sbi->s_bal_goals), | 2583 | atomic_read(&sbi->s_bal_goals), |
2564 | atomic_read(&sbi->s_bal_2orders), | 2584 | atomic_read(&sbi->s_bal_2orders), |
2565 | atomic_read(&sbi->s_bal_breaks), | 2585 | atomic_read(&sbi->s_bal_breaks), |
2566 | atomic_read(&sbi->s_mb_lost_chunks)); | 2586 | atomic_read(&sbi->s_mb_lost_chunks)); |
2567 | printk(KERN_INFO | 2587 | ext4_msg(sb, KERN_INFO, |
2568 | "EXT4-fs: mballoc: %lu generated and it took %Lu\n", | 2588 | "mballoc: %lu generated and it took %Lu", |
2569 | sbi->s_mb_buddies_generated++, | 2589 | sbi->s_mb_buddies_generated, |
2570 | sbi->s_mb_generation_time); | 2590 | sbi->s_mb_generation_time); |
2571 | printk(KERN_INFO | 2591 | ext4_msg(sb, KERN_INFO, |
2572 | "EXT4-fs: mballoc: %u preallocated, %u discarded\n", | 2592 | "mballoc: %u preallocated, %u discarded", |
2573 | atomic_read(&sbi->s_mb_preallocated), | 2593 | atomic_read(&sbi->s_mb_preallocated), |
2574 | atomic_read(&sbi->s_mb_discarded)); | 2594 | atomic_read(&sbi->s_mb_discarded)); |
2575 | } | 2595 | } |
@@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | |||
2628 | rb_erase(&entry->node, &(db->bb_free_root)); | 2648 | rb_erase(&entry->node, &(db->bb_free_root)); |
2629 | mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); | 2649 | mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); |
2630 | 2650 | ||
2651 | /* | ||
2652 | * Clear the trimmed flag for the group so that the next | ||
2653 | * ext4_trim_fs can trim it. | ||
2654 | * If the volume is mounted with -o discard, online discard | ||
2655 | * is supported and the free blocks will be trimmed online. | ||
2656 | */ | ||
2657 | if (!test_opt(sb, DISCARD)) | ||
2658 | EXT4_MB_GRP_CLEAR_TRIMMED(db); | ||
2659 | |||
2631 | if (!db->bb_free_root.rb_node) { | 2660 | if (!db->bb_free_root.rb_node) { |
2632 | /* No more items in the per group rb tree | 2661 | /* No more items in the per group rb tree |
2633 | * balance refcounts from ext4_mb_free_metadata() | 2662 | * balance refcounts from ext4_mb_free_metadata() |
@@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2771 | * We leak some of the blocks here. | 2800 | * We leak some of the blocks here. |
2772 | */ | 2801 | */ |
2773 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | 2802 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); |
2774 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, | 2803 | ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
2775 | ac->ac_b_ex.fe_len); | 2804 | ac->ac_b_ex.fe_len); |
2776 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | 2805 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); |
2777 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); | 2806 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
2778 | if (!err) | 2807 | if (!err) |
@@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2790 | } | 2819 | } |
2791 | } | 2820 | } |
2792 | #endif | 2821 | #endif |
2793 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); | 2822 | ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
2823 | ac->ac_b_ex.fe_len); | ||
2794 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 2824 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
2795 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | 2825 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
2796 | ext4_free_blks_set(sb, gdp, | 2826 | ext4_free_blks_set(sb, gdp, |
@@ -2830,8 +2860,9 @@ out_err: | |||
2830 | 2860 | ||
2831 | /* | 2861 | /* |
2832 | * here we normalize request for locality group | 2862 | * here we normalize request for locality group |
2833 | * Group request are normalized to s_strip size if we set the same via mount | 2863 | * Group requests are normalized to s_mb_group_prealloc, which is based on
2834 | * option. If not we set it to s_mb_group_prealloc which can be configured via | 2864 | * s_stripe if it was set via the mount option.
2865 | * s_mb_group_prealloc can be configured via | ||
2835 | * /sys/fs/ext4/<partition>/mb_group_prealloc | 2866 | * /sys/fs/ext4/<partition>/mb_group_prealloc |
2836 | * | 2867 | * |
2837 | * XXX: should we try to preallocate more than the group has now? | 2868 | * XXX: should we try to preallocate more than the group has now? |
@@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) | |||
2842 | struct ext4_locality_group *lg = ac->ac_lg; | 2873 | struct ext4_locality_group *lg = ac->ac_lg; |
2843 | 2874 | ||
2844 | BUG_ON(lg == NULL); | 2875 | BUG_ON(lg == NULL); |
2845 | if (EXT4_SB(sb)->s_stripe) | 2876 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; |
2846 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; | ||
2847 | else | ||
2848 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; | ||
2849 | mb_debug(1, "#%u: goal %u blocks for locality group\n", | 2877 | mb_debug(1, "#%u: goal %u blocks for locality group\n", |
2850 | current->pid, ac->ac_g_ex.fe_len); | 2878 | current->pid, ac->ac_g_ex.fe_len); |
2851 | } | 2879 | } |
@@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
3001 | 3029 | ||
3002 | if (start + size <= ac->ac_o_ex.fe_logical && | 3030 | if (start + size <= ac->ac_o_ex.fe_logical && |
3003 | start > ac->ac_o_ex.fe_logical) { | 3031 | start > ac->ac_o_ex.fe_logical) { |
3004 | printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", | 3032 | ext4_msg(ac->ac_sb, KERN_ERR, |
3005 | (unsigned long) start, (unsigned long) size, | 3033 | "start %lu, size %lu, fe_logical %lu", |
3006 | (unsigned long) ac->ac_o_ex.fe_logical); | 3034 | (unsigned long) start, (unsigned long) size, |
3035 | (unsigned long) ac->ac_o_ex.fe_logical); | ||
3007 | } | 3036 | } |
3008 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && | 3037 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && |
3009 | start > ac->ac_o_ex.fe_logical); | 3038 | start > ac->ac_o_ex.fe_logical); |
@@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | |||
3262 | 3291 | ||
3263 | while (n) { | 3292 | while (n) { |
3264 | entry = rb_entry(n, struct ext4_free_data, node); | 3293 | entry = rb_entry(n, struct ext4_free_data, node); |
3265 | mb_set_bits(bitmap, entry->start_blk, entry->count); | 3294 | ext4_set_bits(bitmap, entry->start_blk, entry->count); |
3266 | n = rb_next(n); | 3295 | n = rb_next(n); |
3267 | } | 3296 | } |
3268 | return; | 3297 | return; |
@@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |||
3304 | if (unlikely(len == 0)) | 3333 | if (unlikely(len == 0)) |
3305 | continue; | 3334 | continue; |
3306 | BUG_ON(groupnr != group); | 3335 | BUG_ON(groupnr != group); |
3307 | mb_set_bits(bitmap, start, len); | 3336 | ext4_set_bits(bitmap, start, len); |
3308 | preallocated += len; | 3337 | preallocated += len; |
3309 | count++; | 3338 | count++; |
3310 | } | 3339 | } |
@@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3584 | bit = next + 1; | 3613 | bit = next + 1; |
3585 | } | 3614 | } |
3586 | if (free != pa->pa_free) { | 3615 | if (free != pa->pa_free) { |
3587 | printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", | 3616 | ext4_msg(e4b->bd_sb, KERN_CRIT, |
3588 | pa, (unsigned long) pa->pa_lstart, | 3617 | "pa %p: logic %lu, phys. %lu, len %lu", |
3589 | (unsigned long) pa->pa_pstart, | 3618 | pa, (unsigned long) pa->pa_lstart, |
3590 | (unsigned long) pa->pa_len); | 3619 | (unsigned long) pa->pa_pstart, |
3620 | (unsigned long) pa->pa_len); | ||
3591 | ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", | 3621 | ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", |
3592 | free, pa->pa_free); | 3622 | free, pa->pa_free); |
3593 | /* | 3623 | /* |
@@ -3775,7 +3805,8 @@ repeat: | |||
3775 | * use preallocation while we're discarding it */ | 3805 | * use preallocation while we're discarding it */ |
3776 | spin_unlock(&pa->pa_lock); | 3806 | spin_unlock(&pa->pa_lock); |
3777 | spin_unlock(&ei->i_prealloc_lock); | 3807 | spin_unlock(&ei->i_prealloc_lock); |
3778 | printk(KERN_ERR "uh-oh! used pa while discarding\n"); | 3808 | ext4_msg(sb, KERN_ERR, |
3809 | "uh-oh! used pa while discarding"); | ||
3779 | WARN_ON(1); | 3810 | WARN_ON(1); |
3780 | schedule_timeout_uninterruptible(HZ); | 3811 | schedule_timeout_uninterruptible(HZ); |
3781 | goto repeat; | 3812 | goto repeat; |
@@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3852 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) | 3883 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) |
3853 | return; | 3884 | return; |
3854 | 3885 | ||
3855 | printk(KERN_ERR "EXT4-fs: Can't allocate:" | 3886 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" |
3856 | " Allocation context details:\n"); | 3887 | " Allocation context details:"); |
3857 | printk(KERN_ERR "EXT4-fs: status %d flags %d\n", | 3888 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", |
3858 | ac->ac_status, ac->ac_flags); | 3889 | ac->ac_status, ac->ac_flags); |
3859 | printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " | 3890 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " |
3860 | "best %lu/%lu/%lu@%lu cr %d\n", | 3891 | "goal %lu/%lu/%lu@%lu, " |
3892 | "best %lu/%lu/%lu@%lu cr %d", | ||
3861 | (unsigned long)ac->ac_o_ex.fe_group, | 3893 | (unsigned long)ac->ac_o_ex.fe_group, |
3862 | (unsigned long)ac->ac_o_ex.fe_start, | 3894 | (unsigned long)ac->ac_o_ex.fe_start, |
3863 | (unsigned long)ac->ac_o_ex.fe_len, | 3895 | (unsigned long)ac->ac_o_ex.fe_len, |
@@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3871 | (unsigned long)ac->ac_b_ex.fe_len, | 3903 | (unsigned long)ac->ac_b_ex.fe_len, |
3872 | (unsigned long)ac->ac_b_ex.fe_logical, | 3904 | (unsigned long)ac->ac_b_ex.fe_logical, |
3873 | (int)ac->ac_criteria); | 3905 | (int)ac->ac_criteria); |
3874 | printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, | 3906 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", |
3875 | ac->ac_found); | 3907 | ac->ac_ex_scanned, ac->ac_found); |
3876 | printk(KERN_ERR "EXT4-fs: groups: \n"); | 3908 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); |
3877 | ngroups = ext4_get_groups_count(sb); | 3909 | ngroups = ext4_get_groups_count(sb); |
3878 | for (i = 0; i < ngroups; i++) { | 3910 | for (i = 0; i < ngroups; i++) { |
3879 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); | 3911 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); |
@@ -4637,7 +4669,7 @@ do_more: | |||
4637 | } | 4669 | } |
4638 | ext4_mark_super_dirty(sb); | 4670 | ext4_mark_super_dirty(sb); |
4639 | error_return: | 4671 | error_return: |
4640 | if (freed) | 4672 | if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) |
4641 | dquot_free_block(inode, freed); | 4673 | dquot_free_block(inode, freed); |
4642 | brelse(bitmap_bh); | 4674 | brelse(bitmap_bh); |
4643 | ext4_std_error(sb, err); | 4675 | ext4_std_error(sb, err); |
@@ -4645,7 +4677,7 @@ error_return: | |||
4645 | } | 4677 | } |
4646 | 4678 | ||
4647 | /** | 4679 | /** |
4648 | * ext4_add_groupblocks() -- Add given blocks to an existing group | 4680 | * ext4_group_add_blocks() -- Add given blocks to an existing group |
4649 | * @handle: handle to this transaction | 4681 | * @handle: handle to this transaction |
4650 | * @sb: super block | 4682 | * @sb: super block |
4651 | * @block: start physical block to add to the block group | 4683 | * @block: start physical block to add to the block group
@@ -4653,7 +4685,7 @@ error_return: | |||
4653 | * | 4685 | * |
4654 | * This marks the blocks as free in the bitmap and buddy. | 4686 | * This marks the blocks as free in the bitmap and buddy. |
4655 | */ | 4687 | */ |
4656 | void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | 4688 | int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, |
4657 | ext4_fsblk_t block, unsigned long count) | 4689 | ext4_fsblk_t block, unsigned long count) |
4658 | { | 4690 | { |
4659 | struct buffer_head *bitmap_bh = NULL; | 4691 | struct buffer_head *bitmap_bh = NULL; |
@@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4666 | struct ext4_buddy e4b; | 4698 | struct ext4_buddy e4b; |
4667 | int err = 0, ret, blk_free_count; | 4699 | int err = 0, ret, blk_free_count; |
4668 | ext4_grpblk_t blocks_freed; | 4700 | ext4_grpblk_t blocks_freed; |
4669 | struct ext4_group_info *grp; | ||
4670 | 4701 | ||
4671 | ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); | 4702 | ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); |
4672 | 4703 | ||
4704 | if (count == 0) | ||
4705 | return 0; | ||
4706 | |||
4673 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); | 4707 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); |
4674 | grp = ext4_get_group_info(sb, block_group); | ||
4675 | /* | 4708 | /* |
4676 | * Check to see if we are freeing blocks across a group | 4709 | * Check to see if we are freeing blocks across a group |
4677 | * boundary. | 4710 | * boundary. |
4678 | */ | 4711 | */ |
4679 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) | 4712 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { |
4713 | ext4_warning(sb, "too many blocks added to group %u\n", | ||
4714 | block_group); | ||
4715 | err = -EINVAL; | ||
4680 | goto error_return; | 4716 | goto error_return; |
4717 | } | ||
4681 | 4718 | ||
4682 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); | 4719 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
4683 | if (!bitmap_bh) | 4720 | if (!bitmap_bh) { |
4721 | err = -EIO; | ||
4684 | goto error_return; | 4722 | goto error_return; |
4723 | } | ||
4724 | |||
4685 | desc = ext4_get_group_desc(sb, block_group, &gd_bh); | 4725 | desc = ext4_get_group_desc(sb, block_group, &gd_bh); |
4686 | if (!desc) | 4726 | if (!desc) { |
4727 | err = -EIO; | ||
4687 | goto error_return; | 4728 | goto error_return; |
4729 | } | ||
4688 | 4730 | ||
4689 | if (in_range(ext4_block_bitmap(sb, desc), block, count) || | 4731 | if (in_range(ext4_block_bitmap(sb, desc), block, count) || |
4690 | in_range(ext4_inode_bitmap(sb, desc), block, count) || | 4732 | in_range(ext4_inode_bitmap(sb, desc), block, count) || |
@@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4694 | ext4_error(sb, "Adding blocks in system zones - " | 4736 | ext4_error(sb, "Adding blocks in system zones - " |
4695 | "Block = %llu, count = %lu", | 4737 | "Block = %llu, count = %lu", |
4696 | block, count); | 4738 | block, count); |
4739 | err = -EINVAL; | ||
4697 | goto error_return; | 4740 | goto error_return; |
4698 | } | 4741 | } |
4699 | 4742 | ||
@@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4762 | error_return: | 4805 | error_return: |
4763 | brelse(bitmap_bh); | 4806 | brelse(bitmap_bh); |
4764 | ext4_std_error(sb, err); | 4807 | ext4_std_error(sb, err); |
4765 | return; | 4808 | return err; |
4766 | } | 4809 | } |
4767 | 4810 | ||
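The hunk above converts ext4_group_add_blocks() from void to int so callers such as ext4_group_extend() can see failures. The shape of the change — set err, jump to a single cleanup label, return it — in a minimal userspace C sketch (helper names are hypothetical, not the kernel API):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for ext4_read_block_bitmap(). */
static void *read_bitmap(unsigned int group) { (void)group; return malloc(1); }

static int group_add_blocks(unsigned int group, unsigned long bit,
			    unsigned long count, unsigned long blocks_per_group)
{
	void *bitmap = NULL;
	int err = 0;

	if (count == 0)
		return 0;
	if (bit + count > blocks_per_group) {
		err = -EINVAL;		/* range crosses the group boundary */
		goto error_return;
	}
	bitmap = read_bitmap(group);
	if (!bitmap) {
		err = -EIO;
		goto error_return;
	}
	/* ... mark the blocks free in bitmap and buddy ... */
error_return:
	free(bitmap);			/* like brelse(), safe on NULL */
	return err;
}

int main(void)
{
	/* 16 blocks starting 8 short of the group end: rejected */
	printf("%d\n", group_add_blocks(0, 32760, 16, 32768));	/* -22 */
	return 0;
}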
4768 | /** | 4811 | /** |
@@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, | |||
4782 | { | 4825 | { |
4783 | struct ext4_free_extent ex; | 4826 | struct ext4_free_extent ex; |
4784 | 4827 | ||
4828 | trace_ext4_trim_extent(sb, group, start, count); | ||
4829 | |||
4785 | assert_spin_locked(ext4_group_lock_ptr(sb, group)); | 4830 | assert_spin_locked(ext4_group_lock_ptr(sb, group)); |
4786 | 4831 | ||
4787 | ex.fe_start = start; | 4832 | ex.fe_start = start; |
@@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, | |||
4802 | /** | 4847 | /** |
4803 | * ext4_trim_all_free -- function to trim all free space in alloc. group | 4848 | * ext4_trim_all_free -- function to trim all free space in alloc. group |
4804 | * @sb: super block for file system | 4849 | * @sb: super block for file system |
4805 | * @e4b: ext4 buddy | 4850 | * @group: group to be trimmed |
4806 | * @start: first group block to examine | 4851 | * @start: first group block to examine |
4807 | * @max: last group block to examine | 4852 | * @max: last group block to examine |
4808 | * @minblocks: minimum extent block count | 4853 | * @minblocks: minimum extent block count |
@@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4823 | ext4_grpblk_t minblocks) | 4868 | ext4_grpblk_t minblocks) |
4824 | { | 4869 | { |
4825 | void *bitmap; | 4870 | void *bitmap; |
4826 | ext4_grpblk_t next, count = 0; | 4871 | ext4_grpblk_t next, count = 0, free_count = 0; |
4827 | struct ext4_buddy e4b; | 4872 | struct ext4_buddy e4b; |
4828 | int ret; | 4873 | int ret; |
4829 | 4874 | ||
4875 | trace_ext4_trim_all_free(sb, group, start, max); | ||
4876 | |||
4830 | ret = ext4_mb_load_buddy(sb, group, &e4b); | 4877 | ret = ext4_mb_load_buddy(sb, group, &e4b); |
4831 | if (ret) { | 4878 | if (ret) { |
4832 | ext4_error(sb, "Error in loading buddy " | 4879 | ext4_error(sb, "Error in loading buddy " |
@@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4836 | bitmap = e4b.bd_bitmap; | 4883 | bitmap = e4b.bd_bitmap; |
4837 | 4884 | ||
4838 | ext4_lock_group(sb, group); | 4885 | ext4_lock_group(sb, group); |
4886 | if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && | ||
4887 | minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) | ||
4888 | goto out; | ||
4889 | |||
4839 | start = (e4b.bd_info->bb_first_free > start) ? | 4890 | start = (e4b.bd_info->bb_first_free > start) ? |
4840 | e4b.bd_info->bb_first_free : start; | 4891 | e4b.bd_info->bb_first_free : start; |
4841 | 4892 | ||
@@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4850 | next - start, group, &e4b); | 4901 | next - start, group, &e4b); |
4851 | count += next - start; | 4902 | count += next - start; |
4852 | } | 4903 | } |
4904 | free_count += next - start; | ||
4853 | start = next + 1; | 4905 | start = next + 1; |
4854 | 4906 | ||
4855 | if (fatal_signal_pending(current)) { | 4907 | if (fatal_signal_pending(current)) { |
@@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4863 | ext4_lock_group(sb, group); | 4915 | ext4_lock_group(sb, group); |
4864 | } | 4916 | } |
4865 | 4917 | ||
4866 | if ((e4b.bd_info->bb_free - count) < minblocks) | 4918 | if ((e4b.bd_info->bb_free - free_count) < minblocks) |
4867 | break; | 4919 | break; |
4868 | } | 4920 | } |
4921 | |||
4922 | if (!ret) | ||
4923 | EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); | ||
4924 | out: | ||
4869 | ext4_unlock_group(sb, group); | 4925 | ext4_unlock_group(sb, group); |
4870 | ext4_mb_unload_buddy(&e4b); | 4926 | ext4_mb_unload_buddy(&e4b); |
4871 | 4927 | ||
@@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
4904 | 4960 | ||
4905 | if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) | 4961 | if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) |
4906 | return -EINVAL; | 4962 | return -EINVAL; |
4963 | if (start + len <= first_data_blk) | ||
4964 | goto out; | ||
4907 | if (start < first_data_blk) { | 4965 | if (start < first_data_blk) { |
4908 | len -= first_data_blk - start; | 4966 | len -= first_data_blk - start; |
4909 | start = first_data_blk; | 4967 | start = first_data_blk; |
@@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
4952 | } | 5010 | } |
4953 | range->len = trimmed * sb->s_blocksize; | 5011 | range->len = trimmed * sb->s_blocksize; |
4954 | 5012 | ||
5013 | if (!ret) | ||
5014 | atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); | ||
5015 | |||
5016 | out: | ||
4955 | return ret; | 5017 | return ret; |
4956 | } | 5018 | } |
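Taken together, these mballoc hunks cache trim state: EXT4_MB_GRP_SET_TRIMMED() marks a group once it has been trimmed cleanly, and s_last_trim_minblks records the minimum extent length of the last successful FITRIM, so a later pass with an equal or larger minlen can skip groups untouched since then. A userspace model of just the gating logic, assuming the flag is cleared again whenever blocks are freed in the group:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct group_info {
	bool was_trimmed;		/* models EXT4_MB_GRP_WAS_TRIMMED() */
};

static atomic_ulong last_trim_minblks;	/* models s_last_trim_minblks */

/* True if this group can be skipped for the requested minblocks. */
static bool trim_can_skip(const struct group_info *grp, unsigned long minblocks)
{
	return grp->was_trimmed &&
	       minblocks >= atomic_load(&last_trim_minblks);
}

int main(void)
{
	struct group_info grp = { .was_trimmed = false };

	printf("%d\n", trim_can_skip(&grp, 64));	/* 0: never trimmed */
	grp.was_trimmed = true;				/* set after a clean pass */
	atomic_store(&last_trim_minblks, 64);		/* recorded on success */
	printf("%d\n", trim_can_skip(&grp, 128));	/* 1: nothing smaller left */
	printf("%d\n", trim_can_skip(&grp, 16));	/* 0: finer trim requested */
	return 0;
}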
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7bfebd..9d4a636b546 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -187,7 +187,6 @@ struct ext4_allocation_context { | |||
187 | __u16 ac_flags; /* allocation hints */ | 187 | __u16 ac_flags; /* allocation hints */ |
188 | __u8 ac_status; | 188 | __u8 ac_status; |
189 | __u8 ac_criteria; | 189 | __u8 ac_criteria; |
190 | __u8 ac_repeats; | ||
191 | __u8 ac_2order; /* if request is to allocate 2^N blocks and | 190 | __u8 ac_2order; /* if request is to allocate 2^N blocks and |
192 | * N > 0, the field stores N, otherwise 0 */ | 191 | * N > 0, the field stores N, otherwise 0 */ |
193 | __u8 ac_op; /* operation, for history only */ | 192 | __u8 ac_op; /* operation, for history only */ |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b754b7721f5..50c72943d7b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent | |||
289 | while (len--) printk("%c", *name++); | 289 | while (len--) printk("%c", *name++); |
290 | ext4fs_dirhash(de->name, de->name_len, &h); | 290 | ext4fs_dirhash(de->name, de->name_len, &h); |
291 | printk(":%x.%u ", h.hash, | 291 | printk(":%x.%u ", h.hash, |
292 | ((char *) de - base)); | 292 | (unsigned) ((char *) de - base)); |
293 | } | 293 | } |
294 | space += EXT4_DIR_REC_LEN(de->name_len); | 294 | space += EXT4_DIR_REC_LEN(de->name_len); |
295 | names++; | 295 | names++; |
@@ -922,7 +922,8 @@ restart: | |||
922 | bh = ext4_getblk(NULL, dir, b++, 0, &err); | 922 | bh = ext4_getblk(NULL, dir, b++, 0, &err); |
923 | bh_use[ra_max] = bh; | 923 | bh_use[ra_max] = bh; |
924 | if (bh) | 924 | if (bh) |
925 | ll_rw_block(READ_META, 1, &bh); | 925 | ll_rw_block(READ | REQ_META | REQ_PRIO, |
926 | 1, &bh); | ||
926 | } | 927 | } |
927 | } | 928 | } |
928 | if ((bh = bh_use[ra_ptr++]) == NULL) | 929 | if ((bh = bh_use[ra_ptr++]) == NULL) |
@@ -1013,7 +1014,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q | |||
1013 | 1014 | ||
1014 | *err = -ENOENT; | 1015 | *err = -ENOENT; |
1015 | errout: | 1016 | errout: |
1016 | dxtrace(printk(KERN_DEBUG "%s not found\n", name)); | 1017 | dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); |
1017 | dx_release (frames); | 1018 | dx_release (frames); |
1018 | return NULL; | 1019 | return NULL; |
1019 | } | 1020 | } |
@@ -1037,15 +1038,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru | |||
1037 | return ERR_PTR(-EIO); | 1038 | return ERR_PTR(-EIO); |
1038 | } | 1039 | } |
1039 | inode = ext4_iget(dir->i_sb, ino); | 1040 | inode = ext4_iget(dir->i_sb, ino); |
1040 | if (IS_ERR(inode)) { | 1041 | if (inode == ERR_PTR(-ESTALE)) { |
1041 | if (PTR_ERR(inode) == -ESTALE) { | 1042 | EXT4_ERROR_INODE(dir, |
1042 | EXT4_ERROR_INODE(dir, | 1043 | "deleted inode referenced: %u", |
1043 | "deleted inode referenced: %u", | 1044 | ino); |
1044 | ino); | 1045 | return ERR_PTR(-EIO); |
1045 | return ERR_PTR(-EIO); | ||
1046 | } else { | ||
1047 | return ERR_CAST(inode); | ||
1048 | } | ||
1049 | } | 1046 | } |
1050 | } | 1047 | } |
1051 | return d_splice_alias(inode, dentry); | 1048 | return d_splice_alias(inode, dentry); |
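The simplified lookup path compares the result of ext4_iget() directly against ERR_PTR(-ESTALE) rather than unpacking it with PTR_ERR(). For reference, a userspace model of the error-pointer idiom this relies on — an errno encoded in the top page of the pointer space (simplified from linux/err.h):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

int main(void)
{
	void *inode = ERR_PTR(-ESTALE);

	if (inode == ERR_PTR(-ESTALE))		/* the comparison used above */
		printf("stale inode, errno %ld\n", -PTR_ERR(inode));
	printf("IS_ERR: %d\n", IS_ERR(inode));	/* 1 */
	return 0;
}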
@@ -1589,7 +1586,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1589 | dxtrace(dx_show_index("node", frames[1].entries)); | 1586 | dxtrace(dx_show_index("node", frames[1].entries)); |
1590 | dxtrace(dx_show_index("node", | 1587 | dxtrace(dx_show_index("node", |
1591 | ((struct dx_node *) bh2->b_data)->entries)); | 1588 | ((struct dx_node *) bh2->b_data)->entries)); |
1592 | err = ext4_handle_dirty_metadata(handle, inode, bh2); | 1589 | err = ext4_handle_dirty_metadata(handle, dir, bh2); |
1593 | if (err) | 1590 | if (err) |
1594 | goto journal_error; | 1591 | goto journal_error; |
1595 | brelse (bh2); | 1592 | brelse (bh2); |
@@ -1615,7 +1612,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1615 | if (err) | 1612 | if (err) |
1616 | goto journal_error; | 1613 | goto journal_error; |
1617 | } | 1614 | } |
1618 | err = ext4_handle_dirty_metadata(handle, inode, frames[0].bh); | 1615 | err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh); |
1619 | if (err) { | 1616 | if (err) { |
1620 | ext4_std_error(inode->i_sb, err); | 1617 | ext4_std_error(inode->i_sb, err); |
1621 | goto cleanup; | 1618 | goto cleanup; |
@@ -1866,7 +1863,7 @@ retry: | |||
1866 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); | 1863 | ext4_set_de_type(dir->i_sb, de, S_IFDIR); |
1867 | inode->i_nlink = 2; | 1864 | inode->i_nlink = 2; |
1868 | BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); | 1865 | BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); |
1869 | err = ext4_handle_dirty_metadata(handle, dir, dir_block); | 1866 | err = ext4_handle_dirty_metadata(handle, inode, dir_block); |
1870 | if (err) | 1867 | if (err) |
1871 | goto out_clear_inode; | 1868 | goto out_clear_inode; |
1872 | err = ext4_mark_inode_dirty(handle, inode); | 1869 | err = ext4_mark_inode_dirty(handle, inode); |
@@ -1989,18 +1986,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
1989 | if (!list_empty(&EXT4_I(inode)->i_orphan)) | 1986 | if (!list_empty(&EXT4_I(inode)->i_orphan)) |
1990 | goto out_unlock; | 1987 | goto out_unlock; |
1991 | 1988 | ||
1992 | /* Orphan handling is only valid for files with data blocks | 1989 | /* |
1993 | * being truncated, or files being unlinked. */ | 1990 | * Orphan handling is only valid for files with data blocks |
1994 | 1991 | * being truncated, or files being unlinked. Note that we either | |
1995 | /* @@@ FIXME: Observation from aviro: | 1992 | * hold i_mutex, or the inode can not be referenced from outside, |
1996 | * I think I can trigger J_ASSERT in ext4_orphan_add(). We block | 1993 | * so i_nlink should not be bumped due to race |
1997 | * here (on s_orphan_lock), so race with ext4_link() which might bump | ||
1998 | * ->i_nlink. For, say it, character device. Not a regular file, | ||
1999 | * not a directory, not a symlink and ->i_nlink > 0. | ||
2000 | * | ||
2001 | * tytso, 4/25/2009: I'm not sure how that could happen; | ||
2002 | * shouldn't the fs core protect us from these sort of | ||
2003 | * unlink()/link() races? | ||
2004 | */ | 1994 | */ |
2005 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 1995 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
2006 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); | 1996 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); |
@@ -2264,9 +2254,11 @@ static int ext4_symlink(struct inode *dir, | |||
2264 | /* | 2254 | /* |
2265 | * For non-fast symlinks, we just allocate inode and put it on | 2255 | * For non-fast symlinks, we just allocate inode and put it on |
2266 | * orphan list in the first transaction => we need bitmap, | 2256 | * orphan list in the first transaction => we need bitmap, |
2267 | * group descriptor, sb, inode block, quota blocks. | 2257 | * group descriptor, sb, inode block, quota blocks, and |
2258 | * possibly selinux xattr blocks. | ||
2268 | */ | 2259 | */ |
2269 | credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); | 2260 | credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + |
2261 | EXT4_XATTR_TRANS_BLOCKS; | ||
2270 | } else { | 2262 | } else { |
2271 | /* | 2263 | /* |
2272 | * Fast symlink. We have to add entry to directory | 2264 | * Fast symlink. We have to add entry to directory |
@@ -2538,7 +2530,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2538 | PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = | 2530 | PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = |
2539 | cpu_to_le32(new_dir->i_ino); | 2531 | cpu_to_le32(new_dir->i_ino); |
2540 | BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); | 2532 | BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); |
2541 | retval = ext4_handle_dirty_metadata(handle, old_dir, dir_bh); | 2533 | retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh); |
2542 | if (retval) { | 2534 | if (retval) { |
2543 | ext4_std_error(old_dir->i_sb, retval); | 2535 | ext4_std_error(old_dir->i_sb, retval); |
2544 | goto end_rename; | 2536 | goto end_rename; |
@@ -2594,7 +2586,7 @@ const struct inode_operations ext4_dir_inode_operations = { | |||
2594 | .listxattr = ext4_listxattr, | 2586 | .listxattr = ext4_listxattr, |
2595 | .removexattr = generic_removexattr, | 2587 | .removexattr = generic_removexattr, |
2596 | #endif | 2588 | #endif |
2597 | .check_acl = ext4_check_acl, | 2589 | .get_acl = ext4_get_acl, |
2598 | .fiemap = ext4_fiemap, | 2590 | .fiemap = ext4_fiemap, |
2599 | }; | 2591 | }; |
2600 | 2592 | ||
@@ -2606,5 +2598,5 @@ const struct inode_operations ext4_special_inode_operations = { | |||
2606 | .listxattr = ext4_listxattr, | 2598 | .listxattr = ext4_listxattr, |
2607 | .removexattr = generic_removexattr, | 2599 | .removexattr = generic_removexattr, |
2608 | #endif | 2600 | #endif |
2609 | .check_acl = ext4_check_acl, | 2601 | .get_acl = ext4_get_acl, |
2610 | }; | 2602 | }; |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76d470..78ab854f2f9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -142,7 +142,23 @@ static void ext4_end_io_work(struct work_struct *work) | |||
142 | unsigned long flags; | 142 | unsigned long flags; |
143 | int ret; | 143 | int ret; |
144 | 144 | ||
145 | mutex_lock(&inode->i_mutex); | 145 | if (!mutex_trylock(&inode->i_mutex)) { |
146 | /* | ||
147 | * Requeue the work instead of waiting so that the work | ||
148 | * items queued after this can be processed. | ||
149 | */ | ||
150 | queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); | ||
151 | /* | ||
152 | * To prevent the ext4-dio-unwritten thread from keeping | ||
153 | * requeueing end_io requests and occupying cpu for too long, | ||
154 | * yield the cpu if it sees an end_io request that has already | ||
155 | * been requeued. | ||
156 | */ | ||
157 | if (io->flag & EXT4_IO_END_QUEUED) | ||
158 | yield(); | ||
159 | io->flag |= EXT4_IO_END_QUEUED; | ||
160 | return; | ||
161 | } | ||
146 | ret = ext4_end_io_nolock(io); | 162 | ret = ext4_end_io_nolock(io); |
147 | if (ret < 0) { | 163 | if (ret < 0) { |
148 | mutex_unlock(&inode->i_mutex); | 164 | mutex_unlock(&inode->i_mutex); |
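The end_io worker now refuses to block on i_mutex: on contention it requeues itself so later work items can run, and yields once it notices it has already been requeued, so the workqueue thread does not monopolize a CPU. A pthread-based sketch of the trylock-and-requeue pattern (requeue() and the flag are stand-ins, not the kernel API):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

#define IO_END_QUEUED 0x1

struct io_end {
	unsigned int flag;
	pthread_mutex_t *lock;		/* models inode->i_mutex */
};

static void requeue(struct io_end *io) { (void)io; /* models queue_work() */ }

static void end_io_work(struct io_end *io)
{
	if (pthread_mutex_trylock(io->lock) != 0) {
		requeue(io);		/* don't block: defer this item */
		if (io->flag & IO_END_QUEUED)
			sched_yield();	/* already deferred once: back off */
		io->flag |= IO_END_QUEUED;
		return;
	}
	/* ... ext4_end_io_nolock()-style conversion work here ... */
	pthread_mutex_unlock(io->lock);
}

int main(void)
{
	pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
	struct io_end io = { .flag = 0, .lock = &m };

	end_io_work(&io);	/* lock uncontended: work runs inline */
	return 0;
}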
@@ -285,11 +301,7 @@ static int io_submit_init(struct ext4_io_submit *io, | |||
285 | io_end = ext4_init_io_end(inode, GFP_NOFS); | 301 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
286 | if (!io_end) | 302 | if (!io_end) |
287 | return -ENOMEM; | 303 | return -ENOMEM; |
288 | do { | 304 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); |
289 | bio = bio_alloc(GFP_NOIO, nvecs); | ||
290 | nvecs >>= 1; | ||
291 | } while (bio == NULL); | ||
292 | |||
293 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 305 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
294 | bio->bi_bdev = bh->b_bdev; | 306 | bio->bi_bdev = bh->b_bdev; |
295 | bio->bi_private = io->io_end = io_end; | 307 | bio->bi_private = io->io_end = io_end; |
@@ -338,8 +350,10 @@ submit_and_retry: | |||
338 | if ((io_end->num_io_pages >= MAX_IO_PAGES) && | 350 | if ((io_end->num_io_pages >= MAX_IO_PAGES) && |
339 | (io_end->pages[io_end->num_io_pages-1] != io_page)) | 351 | (io_end->pages[io_end->num_io_pages-1] != io_page)) |
340 | goto submit_and_retry; | 352 | goto submit_and_retry; |
341 | if (buffer_uninit(bh)) | 353 | if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) { |
342 | io->io_end->flag |= EXT4_IO_END_UNWRITTEN; | 354 | io_end->flag |= EXT4_IO_END_UNWRITTEN; |
355 | atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten); | ||
356 | } | ||
343 | io->io_end->size += bh->b_size; | 357 | io->io_end->size += bh->b_size; |
344 | io->io_next_block++; | 358 | io->io_next_block++; |
345 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); | 359 | ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); |
@@ -387,6 +401,18 @@ int ext4_bio_write_page(struct ext4_io_submit *io, | |||
387 | 401 | ||
388 | block_end = block_start + blocksize; | 402 | block_end = block_start + blocksize; |
389 | if (block_start >= len) { | 403 | if (block_start >= len) { |
404 | /* | ||
405 | * Comments copied from block_write_full_page_endio: | ||
406 | * | ||
407 | * The page straddles i_size. It must be zeroed out on | ||
408 | * each and every writepage invocation because it may | ||
409 | * be mmapped. "A file is mapped in multiples of the | ||
410 | * page size. For a file that is not a multiple of | ||
411 | * the page size, the remaining memory is zeroed when | ||
412 | * mapped, and writes to that region are not written | ||
413 | * out to the file." | ||
414 | */ | ||
415 | zero_user_segment(page, block_start, block_end); | ||
390 | clear_buffer_dirty(bh); | 416 | clear_buffer_dirty(bh); |
391 | set_buffer_uptodate(bh); | 417 | set_buffer_uptodate(bh); |
392 | continue; | 418 | continue; |
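The zero_user_segment() hunk handles a page that straddles i_size: the tail beyond EOF must be zeroed on every writepage because an mmap'ed writer may have scribbled there. The offset arithmetic for blocks whose start lies at or past the in-page end of file, as a userspace sketch (4K pages and 1K blocks assumed):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE  4096
#define BLOCK_SIZE 1024

/* Zero every block that starts at or beyond len, the in-page portion
 * of i_size; models the loop around zero_user_segment() above. */
static void zero_tail(unsigned char *page, unsigned int len)
{
	unsigned int block_start;

	for (block_start = 0; block_start < PAGE_SIZE;
	     block_start += BLOCK_SIZE)
		if (block_start >= len)
			memset(page + block_start, 0, BLOCK_SIZE);
}

int main(void)
{
	unsigned char page[PAGE_SIZE];

	memset(page, 0xff, sizeof(page));	/* stale mmap'ed data */
	zero_tail(page, 2500);			/* i_size ends mid-page */
	printf("%u %u\n", page[2047], page[3072]);	/* 255 0 */
	return 0;
}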
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 80bbc9c60c2..707d3f16f7c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -16,6 +16,35 @@ | |||
16 | 16 | ||
17 | #include "ext4_jbd2.h" | 17 | #include "ext4_jbd2.h" |
18 | 18 | ||
19 | int ext4_resize_begin(struct super_block *sb) | ||
20 | { | ||
21 | int ret = 0; | ||
22 | |||
23 | if (!capable(CAP_SYS_RESOURCE)) | ||
24 | return -EPERM; | ||
25 | |||
26 | /* | ||
27 | * We are not allowed to do online-resizing on a filesystem mounted | ||
28 | * with error, because it can destroy the filesystem easily. | ||
29 | */ | ||
30 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { | ||
31 | ext4_warning(sb, "There are errors in the filesystem, " | ||
32 | "so online resizing is not allowed\n"); | ||
33 | return -EPERM; | ||
34 | } | ||
35 | |||
36 | if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) | ||
37 | ret = -EBUSY; | ||
38 | |||
39 | return ret; | ||
40 | } | ||
41 | |||
42 | void ext4_resize_end(struct super_block *sb) | ||
43 | { | ||
44 | clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); | ||
45 | smp_mb__after_clear_bit(); | ||
46 | } | ||
47 | |||
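ext4_resize_begin()/ext4_resize_end() replace the old s_resize_lock with a single flag bit: test_and_set_bit_lock() admits exactly one resizer and clear_bit_unlock() releases the bit with the required memory ordering. A C11 model of the same begin/end pair:

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_flag resizing = ATOMIC_FLAG_INIT;	/* models EXT4_RESIZING */

static int resize_begin(void)
{
	/* acquire semantics, like test_and_set_bit_lock() */
	if (atomic_flag_test_and_set_explicit(&resizing, memory_order_acquire))
		return -EBUSY;		/* another resizer is running */
	return 0;
}

static void resize_end(void)
{
	/* release semantics, like clear_bit_unlock() */
	atomic_flag_clear_explicit(&resizing, memory_order_release);
}

int main(void)
{
	printf("%d\n", resize_begin());	/* 0 */
	printf("%d\n", resize_begin());	/* -16: second resizer rejected */
	resize_end();
	printf("%d\n", resize_begin());	/* 0 again */
	return 0;
}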
19 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) | 48 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) |
20 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) | 49 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) |
21 | 50 | ||
@@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, | |||
118 | brelse(bh); | 147 | brelse(bh); |
119 | bh = ERR_PTR(err); | 148 | bh = ERR_PTR(err); |
120 | } else { | 149 | } else { |
121 | lock_buffer(bh); | ||
122 | memset(bh->b_data, 0, sb->s_blocksize); | 150 | memset(bh->b_data, 0, sb->s_blocksize); |
123 | set_buffer_uptodate(bh); | 151 | set_buffer_uptodate(bh); |
124 | unlock_buffer(bh); | ||
125 | } | 152 | } |
126 | 153 | ||
127 | return bh; | 154 | return bh; |
@@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, | |||
132 | * If that fails, restart the transaction & regain write access for the | 159 | * If that fails, restart the transaction & regain write access for the |
133 | * buffer head which is used for block_bitmap modifications. | 160 | * buffer head which is used for block_bitmap modifications. |
134 | */ | 161 | */ |
135 | static int extend_or_restart_transaction(handle_t *handle, int thresh, | 162 | static int extend_or_restart_transaction(handle_t *handle, int thresh) |
136 | struct buffer_head *bh) | ||
137 | { | 163 | { |
138 | int err; | 164 | int err; |
139 | 165 | ||
@@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, | |||
144 | if (err < 0) | 170 | if (err < 0) |
145 | return err; | 171 | return err; |
146 | if (err) { | 172 | if (err) { |
147 | if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) | 173 | err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); |
148 | return err; | 174 | if (err) |
149 | if ((err = ext4_journal_get_write_access(handle, bh))) | ||
150 | return err; | 175 | return err; |
151 | } | 176 | } |
152 | 177 | ||
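With the bitmap buffer argument gone, extend_or_restart_transaction() only has to extend the running handle or restart it; callers now re-take write access per buffer themselves. The extend-then-restart shape, with stubs following the jbd2 return convention (<0 error, 0 extended, >0 extension refused):

#include <stdio.h>

#define MAX_TRANS_DATA 64		/* stands in for EXT4_MAX_TRANS_DATA */

struct handle { int credits_left; };

static int journal_extend(struct handle *h, int n)
{
	(void)h; (void)n;
	return 1;	/* model the common case: transaction can't grow */
}

static int journal_restart(struct handle *h, int n)
{
	h->credits_left = n;		/* commit and start a fresh handle */
	return 0;
}

static int extend_or_restart(struct handle *h, int thresh)
{
	int err;

	if (h->credits_left >= thresh)	/* enough credits already */
		return 0;
	err = journal_extend(h, MAX_TRANS_DATA);
	if (err < 0)
		return err;
	if (err)			/* refused: restart instead */
		return journal_restart(h, MAX_TRANS_DATA);
	return 0;
}

int main(void)
{
	struct handle h = { .credits_left = 0 };
	int err = extend_or_restart(&h, 1);

	printf("%d %d\n", err, h.credits_left);	/* 0 64 */
	return 0;
}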
@@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
181 | if (IS_ERR(handle)) | 206 | if (IS_ERR(handle)) |
182 | return PTR_ERR(handle); | 207 | return PTR_ERR(handle); |
183 | 208 | ||
184 | mutex_lock(&sbi->s_resize_lock); | 209 | BUG_ON(input->group != sbi->s_groups_count); |
185 | if (input->group != sbi->s_groups_count) { | ||
186 | err = -EBUSY; | ||
187 | goto exit_journal; | ||
188 | } | ||
189 | |||
190 | if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { | ||
191 | err = PTR_ERR(bh); | ||
192 | goto exit_journal; | ||
193 | } | ||
194 | |||
195 | if (ext4_bg_has_super(sb, input->group)) { | ||
196 | ext4_debug("mark backup superblock %#04llx (+0)\n", start); | ||
197 | ext4_set_bit(0, bh->b_data); | ||
198 | } | ||
199 | 210 | ||
200 | /* Copy all of the GDT blocks into the backup in this group */ | 211 | /* Copy all of the GDT blocks into the backup in this group */ |
201 | for (i = 0, bit = 1, block = start + 1; | 212 | for (i = 0, bit = 1, block = start + 1; |
@@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
203 | struct buffer_head *gdb; | 214 | struct buffer_head *gdb; |
204 | 215 | ||
205 | ext4_debug("update backup group %#04llx (+%d)\n", block, bit); | 216 | ext4_debug("update backup group %#04llx (+%d)\n", block, bit); |
206 | 217 | err = extend_or_restart_transaction(handle, 1); | |
207 | if ((err = extend_or_restart_transaction(handle, 1, bh))) | 218 | if (err) |
208 | goto exit_bh; | 219 | goto exit_journal; |
209 | 220 | ||
210 | gdb = sb_getblk(sb, block); | 221 | gdb = sb_getblk(sb, block); |
211 | if (!gdb) { | 222 | if (!gdb) { |
212 | err = -EIO; | 223 | err = -EIO; |
213 | goto exit_bh; | 224 | goto exit_journal; |
214 | } | 225 | } |
215 | if ((err = ext4_journal_get_write_access(handle, gdb))) { | 226 | if ((err = ext4_journal_get_write_access(handle, gdb))) { |
216 | brelse(gdb); | 227 | brelse(gdb); |
217 | goto exit_bh; | 228 | goto exit_journal; |
218 | } | 229 | } |
219 | lock_buffer(gdb); | ||
220 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); | 230 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); |
221 | set_buffer_uptodate(gdb); | 231 | set_buffer_uptodate(gdb); |
222 | unlock_buffer(gdb); | ||
223 | err = ext4_handle_dirty_metadata(handle, NULL, gdb); | 232 | err = ext4_handle_dirty_metadata(handle, NULL, gdb); |
224 | if (unlikely(err)) { | 233 | if (unlikely(err)) { |
225 | brelse(gdb); | 234 | brelse(gdb); |
226 | goto exit_bh; | 235 | goto exit_journal; |
227 | } | 236 | } |
228 | ext4_set_bit(bit, bh->b_data); | ||
229 | brelse(gdb); | 237 | brelse(gdb); |
230 | } | 238 | } |
231 | 239 | ||
@@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
235 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, | 243 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, |
236 | GFP_NOFS); | 244 | GFP_NOFS); |
237 | if (err) | 245 | if (err) |
238 | goto exit_bh; | 246 | goto exit_journal; |
239 | for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) | 247 | |
240 | ext4_set_bit(bit, bh->b_data); | 248 | err = extend_or_restart_transaction(handle, 2); |
249 | if (err) | ||
250 | goto exit_journal; | ||
251 | |||
252 | bh = bclean(handle, sb, input->block_bitmap); | ||
253 | if (IS_ERR(bh)) { | ||
254 | err = PTR_ERR(bh); | ||
255 | goto exit_journal; | ||
256 | } | ||
257 | |||
258 | if (ext4_bg_has_super(sb, input->group)) { | ||
259 | ext4_debug("mark backup group tables %#04llx (+0)\n", start); | ||
260 | ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); | ||
261 | } | ||
241 | 262 | ||
242 | ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, | 263 | ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, |
243 | input->block_bitmap - start); | 264 | input->block_bitmap - start); |
@@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
253 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); | 274 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); |
254 | if (err) | 275 | if (err) |
255 | goto exit_bh; | 276 | goto exit_bh; |
256 | for (i = 0, bit = input->inode_table - start; | 277 | ext4_set_bits(bh->b_data, input->inode_table - start, |
257 | i < sbi->s_itb_per_group; i++, bit++) | 278 | sbi->s_itb_per_group); |
258 | ext4_set_bit(bit, bh->b_data); | ||
259 | 279 | ||
260 | if ((err = extend_or_restart_transaction(handle, 2, bh))) | ||
261 | goto exit_bh; | ||
262 | 280 | ||
263 | ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, | 281 | ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, |
264 | bh->b_data); | 282 | bh->b_data); |
@@ -285,7 +303,6 @@ exit_bh: | |||
285 | brelse(bh); | 303 | brelse(bh); |
286 | 304 | ||
287 | exit_journal: | 305 | exit_journal: |
288 | mutex_unlock(&sbi->s_resize_lock); | ||
289 | if ((err2 = ext4_journal_stop(handle)) && !err) | 306 | if ((err2 = ext4_journal_stop(handle)) && !err) |
290 | err = err2; | 307 | err = err2; |
291 | 308 | ||
@@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb, | |||
377 | * fail once we start modifying the data on disk, because JBD has no rollback. | 394 | * fail once we start modifying the data on disk, because JBD has no rollback. |
378 | */ | 395 | */ |
379 | static int add_new_gdb(handle_t *handle, struct inode *inode, | 396 | static int add_new_gdb(handle_t *handle, struct inode *inode, |
380 | struct ext4_new_group_data *input, | 397 | ext4_group_t group) |
381 | struct buffer_head **primary) | ||
382 | { | 398 | { |
383 | struct super_block *sb = inode->i_sb; | 399 | struct super_block *sb = inode->i_sb; |
384 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 400 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
385 | unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | 401 | unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); |
386 | ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; | 402 | ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; |
387 | struct buffer_head **o_group_desc, **n_group_desc; | 403 | struct buffer_head **o_group_desc, **n_group_desc; |
388 | struct buffer_head *dind; | 404 | struct buffer_head *dind; |
405 | struct buffer_head *gdb_bh; | ||
389 | int gdbackups; | 406 | int gdbackups; |
390 | struct ext4_iloc iloc; | 407 | struct ext4_iloc iloc; |
391 | __le32 *data; | 408 | __le32 *data; |
@@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
408 | return -EPERM; | 425 | return -EPERM; |
409 | } | 426 | } |
410 | 427 | ||
411 | *primary = sb_bread(sb, gdblock); | 428 | gdb_bh = sb_bread(sb, gdblock); |
412 | if (!*primary) | 429 | if (!gdb_bh) |
413 | return -EIO; | 430 | return -EIO; |
414 | 431 | ||
415 | if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { | 432 | gdbackups = verify_reserved_gdb(sb, gdb_bh); |
433 | if (gdbackups < 0) { | ||
416 | err = gdbackups; | 434 | err = gdbackups; |
417 | goto exit_bh; | 435 | goto exit_bh; |
418 | } | 436 | } |
@@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
427 | data = (__le32 *)dind->b_data; | 445 | data = (__le32 *)dind->b_data; |
428 | if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { | 446 | if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { |
429 | ext4_warning(sb, "new group %u GDT block %llu not reserved", | 447 | ext4_warning(sb, "new group %u GDT block %llu not reserved", |
430 | input->group, gdblock); | 448 | group, gdblock); |
431 | err = -EINVAL; | 449 | err = -EINVAL; |
432 | goto exit_dind; | 450 | goto exit_dind; |
433 | } | 451 | } |
@@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
436 | if (unlikely(err)) | 454 | if (unlikely(err)) |
437 | goto exit_dind; | 455 | goto exit_dind; |
438 | 456 | ||
439 | err = ext4_journal_get_write_access(handle, *primary); | 457 | err = ext4_journal_get_write_access(handle, gdb_bh); |
440 | if (unlikely(err)) | 458 | if (unlikely(err)) |
441 | goto exit_sbh; | 459 | goto exit_sbh; |
442 | 460 | ||
@@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
449 | if (unlikely(err)) | 467 | if (unlikely(err)) |
450 | goto exit_dindj; | 468 | goto exit_dindj; |
451 | 469 | ||
452 | n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), | 470 | n_group_desc = ext4_kvmalloc((gdb_num + 1) * |
453 | GFP_NOFS); | 471 | sizeof(struct buffer_head *), |
472 | GFP_NOFS); | ||
454 | if (!n_group_desc) { | 473 | if (!n_group_desc) { |
455 | err = -ENOMEM; | 474 | err = -ENOMEM; |
456 | ext4_warning(sb, | 475 | ext4_warning(sb, "not enough memory for %lu groups", |
457 | "not enough memory for %lu groups", gdb_num + 1); | 476 | gdb_num + 1); |
458 | goto exit_inode; | 477 | goto exit_inode; |
459 | } | 478 | } |
460 | 479 | ||
@@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
475 | } | 494 | } |
476 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; | 495 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; |
477 | ext4_mark_iloc_dirty(handle, inode, &iloc); | 496 | ext4_mark_iloc_dirty(handle, inode, &iloc); |
478 | memset((*primary)->b_data, 0, sb->s_blocksize); | 497 | memset(gdb_bh->b_data, 0, sb->s_blocksize); |
479 | err = ext4_handle_dirty_metadata(handle, NULL, *primary); | 498 | err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); |
480 | if (unlikely(err)) { | 499 | if (unlikely(err)) { |
481 | ext4_std_error(sb, err); | 500 | ext4_std_error(sb, err); |
482 | goto exit_inode; | 501 | goto exit_inode; |
@@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
486 | o_group_desc = EXT4_SB(sb)->s_group_desc; | 505 | o_group_desc = EXT4_SB(sb)->s_group_desc; |
487 | memcpy(n_group_desc, o_group_desc, | 506 | memcpy(n_group_desc, o_group_desc, |
488 | EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); | 507 | EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); |
489 | n_group_desc[gdb_num] = *primary; | 508 | n_group_desc[gdb_num] = gdb_bh; |
490 | EXT4_SB(sb)->s_group_desc = n_group_desc; | 509 | EXT4_SB(sb)->s_group_desc = n_group_desc; |
491 | EXT4_SB(sb)->s_gdb_count++; | 510 | EXT4_SB(sb)->s_gdb_count++; |
492 | kfree(o_group_desc); | 511 | ext4_kvfree(o_group_desc); |
493 | 512 | ||
494 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); | 513 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); |
495 | err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); | 514 | err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); |
@@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
499 | return err; | 518 | return err; |
500 | 519 | ||
501 | exit_inode: | 520 | exit_inode: |
521 | ext4_kvfree(n_group_desc); | ||
502 | /* ext4_handle_release_buffer(handle, iloc.bh); */ | 522 | /* ext4_handle_release_buffer(handle, iloc.bh); */ |
503 | brelse(iloc.bh); | 523 | brelse(iloc.bh); |
504 | exit_dindj: | 524 | exit_dindj: |
@@ -508,7 +528,7 @@ exit_sbh: | |||
508 | exit_dind: | 528 | exit_dind: |
509 | brelse(dind); | 529 | brelse(dind); |
510 | exit_bh: | 530 | exit_bh: |
511 | brelse(*primary); | 531 | brelse(gdb_bh); |
512 | 532 | ||
513 | ext4_debug("leaving with error %d\n", err); | 533 | ext4_debug("leaving with error %d\n", err); |
514 | return err; | 534 | return err; |
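add_new_gdb() grows the in-memory group-descriptor array by allocating a larger array (now with ext4_kvmalloc), copying the old pointers across, appending the new descriptor buffer, publishing the new array, and only then freeing the old one; on failure the new array is released at exit_inode. The grow-and-swap in plain C, with malloc standing in for ext4_kvmalloc:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char **group_desc;	/* models sbi->s_group_desc */
static size_t gdb_count;	/* models sbi->s_gdb_count */

static int add_new_gdb(const char *gdb_bh)
{
	const char **n = malloc((gdb_count + 1) * sizeof(*n));
	const char **o = group_desc;

	if (!n)
		return -1;
	if (gdb_count)
		memcpy(n, o, gdb_count * sizeof(*n));	/* keep old entries */
	n[gdb_count] = gdb_bh;		/* append the new block */
	group_desc = n;			/* publish, then free the old array */
	gdb_count++;
	free(o);
	return 0;
}

int main(void)
{
	add_new_gdb("gdb0");
	add_new_gdb("gdb1");
	printf("%zu %s\n", gdb_count, group_desc[1]);	/* 2 gdb1 */
	return 0;
}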
@@ -528,7 +548,7 @@ exit_bh: | |||
528 | * backup GDT blocks are stored in their reserved primary GDT block. | 548 | * backup GDT blocks are stored in their reserved primary GDT block. |
529 | */ | 549 | */ |
530 | static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | 550 | static int reserve_backup_gdb(handle_t *handle, struct inode *inode, |
531 | struct ext4_new_group_data *input) | 551 | ext4_group_t group) |
532 | { | 552 | { |
533 | struct super_block *sb = inode->i_sb; | 553 | struct super_block *sb = inode->i_sb; |
534 | int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); | 554 | int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); |
@@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | |||
599 | * Finally we can add each of the reserved backup GDT blocks from | 619 | * Finally we can add each of the reserved backup GDT blocks from |
600 | * the new group to its reserved primary GDT block. | 620 | * the new group to its reserved primary GDT block. |
601 | */ | 621 | */ |
602 | blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); | 622 | blk = group * EXT4_BLOCKS_PER_GROUP(sb); |
603 | for (i = 0; i < reserved_gdb; i++) { | 623 | for (i = 0; i < reserved_gdb; i++) { |
604 | int err2; | 624 | int err2; |
605 | data = (__le32 *)primary[i]->b_data; | 625 | data = (__le32 *)primary[i]->b_data; |
@@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
799 | goto exit_put; | 819 | goto exit_put; |
800 | } | 820 | } |
801 | 821 | ||
802 | mutex_lock(&sbi->s_resize_lock); | ||
803 | if (input->group != sbi->s_groups_count) { | ||
804 | ext4_warning(sb, "multiple resizers run on filesystem!"); | ||
805 | err = -EBUSY; | ||
806 | goto exit_journal; | ||
807 | } | ||
808 | |||
809 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) | 822 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) |
810 | goto exit_journal; | 823 | goto exit_journal; |
811 | 824 | ||
@@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
820 | if ((err = ext4_journal_get_write_access(handle, primary))) | 833 | if ((err = ext4_journal_get_write_access(handle, primary))) |
821 | goto exit_journal; | 834 | goto exit_journal; |
822 | 835 | ||
823 | if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && | 836 | if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { |
824 | (err = reserve_backup_gdb(handle, inode, input))) | 837 | err = reserve_backup_gdb(handle, inode, input->group); |
838 | if (err) | ||
839 | goto exit_journal; | ||
840 | } | ||
841 | } else { | ||
842 | /* | ||
843 | * Note that we can access new group descriptor block safely | ||
844 | * only if add_new_gdb() succeeds. | ||
845 | */ | ||
846 | err = add_new_gdb(handle, inode, input->group); | ||
847 | if (err) | ||
825 | goto exit_journal; | 848 | goto exit_journal; |
826 | } else if ((err = add_new_gdb(handle, inode, input, &primary))) | 849 | primary = sbi->s_group_desc[gdb_num]; |
827 | goto exit_journal; | 850 | } |
828 | 851 | ||
829 | /* | 852 | /* |
830 | * OK, now we've set up the new group. Time to make it active. | 853 | * OK, now we've set up the new group. Time to make it active. |
831 | * | 854 | * |
832 | * We do not lock all allocations via s_resize_lock | ||
833 | * so we have to be safe wrt. concurrent accesses the group | 855 | * so we have to be safe wrt. concurrent accesses the group |
834 | * data. So we need to be careful to set all of the relevant | 856 | * data. So we need to be careful to set all of the relevant |
835 | * group descriptor data etc. *before* we enable the group. | 857 | * group descriptor data etc. *before* we enable the group. |
@@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
886 | * | 908 | * |
887 | * The precise rules we use are: | 909 | * The precise rules we use are: |
888 | * | 910 | * |
889 | * * Writers of s_groups_count *must* hold s_resize_lock | ||
890 | * AND | ||
891 | * * Writers must perform a smp_wmb() after updating all dependent | 911 | * * Writers must perform a smp_wmb() after updating all dependent |
892 | * data and before modifying the groups count | 912 | * data and before modifying the groups count |
893 | * | 913 | * |
894 | * * Readers must hold s_resize_lock over the access | ||
895 | * OR | ||
896 | * * Readers must perform an smp_rmb() after reading the groups count | 914 | * * Readers must perform an smp_rmb() after reading the groups count |
897 | * and before reading any dependent data. | 915 | * and before reading any dependent data. |
898 | * | 916 | * |
@@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
937 | ext4_handle_dirty_super(handle, sb); | 955 | ext4_handle_dirty_super(handle, sb); |
938 | 956 | ||
939 | exit_journal: | 957 | exit_journal: |
940 | mutex_unlock(&sbi->s_resize_lock); | ||
941 | if ((err2 = ext4_journal_stop(handle)) && !err) | 958 | if ((err2 = ext4_journal_stop(handle)) && !err) |
942 | err = err2; | 959 | err = err2; |
943 | if (!err) { | 960 | if (!err && primary) { |
944 | update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, | 961 | update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, |
945 | sizeof(struct ext4_super_block)); | 962 | sizeof(struct ext4_super_block)); |
946 | update_backups(sb, primary->b_blocknr, primary->b_data, | 963 | update_backups(sb, primary->b_blocknr, primary->b_data, |
@@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
969 | ext4_grpblk_t add; | 986 | ext4_grpblk_t add; |
970 | struct buffer_head *bh; | 987 | struct buffer_head *bh; |
971 | handle_t *handle; | 988 | handle_t *handle; |
972 | int err; | 989 | int err, err2; |
973 | ext4_group_t group; | 990 | ext4_group_t group; |
974 | 991 | ||
975 | /* We don't need to worry about locking wrt other resizers just | ||
976 | * yet: we're going to revalidate es->s_blocks_count after | ||
977 | * taking the s_resize_lock below. */ | ||
978 | o_blocks_count = ext4_blocks_count(es); | 992 | o_blocks_count = ext4_blocks_count(es); |
979 | 993 | ||
980 | if (test_opt(sb, DEBUG)) | 994 | if (test_opt(sb, DEBUG)) |
981 | printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", | 995 | printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", |
982 | o_blocks_count, n_blocks_count); | 996 | o_blocks_count, n_blocks_count); |
983 | 997 | ||
984 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) | 998 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) |
@@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
995 | 1009 | ||
996 | if (n_blocks_count < o_blocks_count) { | 1010 | if (n_blocks_count < o_blocks_count) { |
997 | ext4_warning(sb, "can't shrink FS - resize aborted"); | 1011 | ext4_warning(sb, "can't shrink FS - resize aborted"); |
998 | return -EBUSY; | 1012 | return -EINVAL; |
999 | } | 1013 | } |
1000 | 1014 | ||
1001 | /* Handle the remaining blocks in the last group only. */ | 1015 | /* Handle the remaining blocks in the last group only. */ |
@@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
1038 | goto exit_put; | 1052 | goto exit_put; |
1039 | } | 1053 | } |
1040 | 1054 | ||
1041 | mutex_lock(&EXT4_SB(sb)->s_resize_lock); | ||
1042 | if (o_blocks_count != ext4_blocks_count(es)) { | ||
1043 | ext4_warning(sb, "multiple resizers run on filesystem!"); | ||
1044 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1045 | ext4_journal_stop(handle); | ||
1046 | err = -EBUSY; | ||
1047 | goto exit_put; | ||
1048 | } | ||
1049 | |||
1050 | if ((err = ext4_journal_get_write_access(handle, | 1055 | if ((err = ext4_journal_get_write_access(handle, |
1051 | EXT4_SB(sb)->s_sbh))) { | 1056 | EXT4_SB(sb)->s_sbh))) { |
1052 | ext4_warning(sb, "error %d on journal write access", err); | 1057 | ext4_warning(sb, "error %d on journal write access", err); |
1053 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1054 | ext4_journal_stop(handle); | 1058 | ext4_journal_stop(handle); |
1055 | goto exit_put; | 1059 | goto exit_put; |
1056 | } | 1060 | } |
1057 | ext4_blocks_count_set(es, o_blocks_count + add); | 1061 | ext4_blocks_count_set(es, o_blocks_count + add); |
1058 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1059 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, | 1062 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, |
1060 | o_blocks_count + add); | 1063 | o_blocks_count + add); |
1061 | /* We add the blocks to the bitmap and set the group need init bit */ | 1064 | /* We add the blocks to the bitmap and set the group need init bit */ |
1062 | ext4_add_groupblocks(handle, sb, o_blocks_count, add); | 1065 | err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); |
1063 | ext4_handle_dirty_super(handle, sb); | 1066 | ext4_handle_dirty_super(handle, sb); |
1064 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, | 1067 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, |
1065 | o_blocks_count + add); | 1068 | o_blocks_count + add); |
1066 | if ((err = ext4_journal_stop(handle))) | 1069 | err2 = ext4_journal_stop(handle); |
1070 | if (!err && err2) | ||
1071 | err = err2; | ||
1072 | |||
1073 | if (err) | ||
1067 | goto exit_put; | 1074 | goto exit_put; |
1068 | 1075 | ||
1069 | if (test_opt(sb, DEBUG)) | 1076 | if (test_opt(sb, DEBUG)) |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa864b..40bfe8dc502 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = { | |||
110 | #define IS_EXT3_SB(sb) (0) | 110 | #define IS_EXT3_SB(sb) (0) |
111 | #endif | 111 | #endif |
112 | 112 | ||
113 | void *ext4_kvmalloc(size_t size, gfp_t flags) | ||
114 | { | ||
115 | void *ret; | ||
116 | |||
117 | ret = kmalloc(size, flags); | ||
118 | if (!ret) | ||
119 | ret = __vmalloc(size, flags, PAGE_KERNEL); | ||
120 | return ret; | ||
121 | } | ||
122 | |||
123 | void *ext4_kvzalloc(size_t size, gfp_t flags) | ||
124 | { | ||
125 | void *ret; | ||
126 | |||
127 | ret = kzalloc(size, flags); | ||
128 | if (!ret) | ||
129 | ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); | ||
130 | return ret; | ||
131 | } | ||
132 | |||
133 | void ext4_kvfree(void *ptr) | ||
134 | { | ||
135 | if (is_vmalloc_addr(ptr)) | ||
136 | vfree(ptr); | ||
137 | else | ||
138 | kfree(ptr); | ||
139 | |||
140 | } | ||
141 | |||
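ext4_kvmalloc()/ext4_kvzalloc() try kmalloc first and fall back to vmalloc when physically contiguous memory is unavailable, and ext4_kvfree() dispatches to the matching free via is_vmalloc_addr(). A portable model of the same fallback, using a small header to record the allocation's origin in place of is_vmalloc_addr() (illustrative only — the kernel needs no such header):

#include <stdio.h>
#include <stdlib.h>

struct vhdr { int vmalloced; };	/* plays the role of is_vmalloc_addr() */

static void *kv_malloc(size_t size, int contig_fails)
{
	struct vhdr *h = NULL;

	if (!contig_fails)			/* try "kmalloc" first */
		h = malloc(sizeof(*h) + size);
	if (h) {
		h->vmalloced = 0;
	} else {				/* fall back to "vmalloc" */
		h = malloc(sizeof(*h) + size);
		if (!h)
			return NULL;
		h->vmalloced = 1;
	}
	return h + 1;
}

static void kv_free(void *ptr)
{
	struct vhdr *h = (struct vhdr *)ptr - 1;

	printf("%s\n", h->vmalloced ? "vfree" : "kfree");
	free(h);
}

int main(void)
{
	void *p = kv_malloc(1 << 20, 1);	/* force the fallback path */
	kv_free(p);				/* prints "vfree" */
	return 0;
}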
113 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | 142 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, |
114 | struct ext4_group_desc *bg) | 143 | struct ext4_group_desc *bg) |
115 | { | 144 | { |
@@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) | |||
269 | journal_t *journal; | 298 | journal_t *journal; |
270 | handle_t *handle; | 299 | handle_t *handle; |
271 | 300 | ||
301 | trace_ext4_journal_start(sb, nblocks, _RET_IP_); | ||
272 | if (sb->s_flags & MS_RDONLY) | 302 | if (sb->s_flags & MS_RDONLY) |
273 | return ERR_PTR(-EROFS); | 303 | return ERR_PTR(-EROFS); |
274 | 304 | ||
@@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb) | |||
789 | 819 | ||
790 | for (i = 0; i < sbi->s_gdb_count; i++) | 820 | for (i = 0; i < sbi->s_gdb_count; i++) |
791 | brelse(sbi->s_group_desc[i]); | 821 | brelse(sbi->s_group_desc[i]); |
792 | kfree(sbi->s_group_desc); | 822 | ext4_kvfree(sbi->s_group_desc); |
793 | if (is_vmalloc_addr(sbi->s_flex_groups)) | 823 | ext4_kvfree(sbi->s_flex_groups); |
794 | vfree(sbi->s_flex_groups); | ||
795 | else | ||
796 | kfree(sbi->s_flex_groups); | ||
797 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 824 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
798 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 825 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
799 | percpu_counter_destroy(&sbi->s_dirs_counter); | 826 | percpu_counter_destroy(&sbi->s_dirs_counter); |
@@ -892,7 +919,6 @@ static void ext4_i_callback(struct rcu_head *head) | |||
892 | 919 | ||
893 | static void ext4_destroy_inode(struct inode *inode) | 920 | static void ext4_destroy_inode(struct inode *inode) |
894 | { | 921 | { |
895 | ext4_ioend_wait(inode); | ||
896 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { | 922 | if (!list_empty(&(EXT4_I(inode)->i_orphan))) { |
897 | ext4_msg(inode->i_sb, KERN_ERR, | 923 | ext4_msg(inode->i_sb, KERN_ERR, |
898 | "Inode %lu (%p): orphan list check failed!", | 924 | "Inode %lu (%p): orphan list check failed!", |
@@ -1114,9 +1140,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1114 | seq_puts(seq, ",block_validity"); | 1140 | seq_puts(seq, ",block_validity"); |
1115 | 1141 | ||
1116 | if (!test_opt(sb, INIT_INODE_TABLE)) | 1142 | if (!test_opt(sb, INIT_INODE_TABLE)) |
1117 | seq_puts(seq, ",noinit_inode_table"); | 1143 | seq_puts(seq, ",noinit_itable"); |
1118 | else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) | 1144 | else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT) |
1119 | seq_printf(seq, ",init_inode_table=%u", | 1145 | seq_printf(seq, ",init_itable=%u", |
1120 | (unsigned) sbi->s_li_wait_mult); | 1146 | (unsigned) sbi->s_li_wait_mult); |
1121 | 1147 | ||
1122 | ext4_show_quota_options(seq, sb); | 1148 | ext4_show_quota_options(seq, sb); |
@@ -1292,8 +1318,7 @@ enum { | |||
1292 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, | 1318 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, |
1293 | Opt_inode_readahead_blks, Opt_journal_ioprio, | 1319 | Opt_inode_readahead_blks, Opt_journal_ioprio, |
1294 | Opt_dioread_nolock, Opt_dioread_lock, | 1320 | Opt_dioread_nolock, Opt_dioread_lock, |
1295 | Opt_discard, Opt_nodiscard, | 1321 | Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, |
1296 | Opt_init_inode_table, Opt_noinit_inode_table, | ||
1297 | }; | 1322 | }; |
1298 | 1323 | ||
1299 | static const match_table_t tokens = { | 1324 | static const match_table_t tokens = { |
@@ -1366,9 +1391,9 @@ static const match_table_t tokens = { | |||
1366 | {Opt_dioread_lock, "dioread_lock"}, | 1391 | {Opt_dioread_lock, "dioread_lock"}, |
1367 | {Opt_discard, "discard"}, | 1392 | {Opt_discard, "discard"}, |
1368 | {Opt_nodiscard, "nodiscard"}, | 1393 | {Opt_nodiscard, "nodiscard"}, |
1369 | {Opt_init_inode_table, "init_itable=%u"}, | 1394 | {Opt_init_itable, "init_itable=%u"}, |
1370 | {Opt_init_inode_table, "init_itable"}, | 1395 | {Opt_init_itable, "init_itable"}, |
1371 | {Opt_noinit_inode_table, "noinit_itable"}, | 1396 | {Opt_noinit_itable, "noinit_itable"}, |
1372 | {Opt_err, NULL}, | 1397 | {Opt_err, NULL}, |
1373 | }; | 1398 | }; |
1374 | 1399 | ||
@@ -1845,7 +1870,7 @@ set_qf_format: | |||
1845 | case Opt_dioread_lock: | 1870 | case Opt_dioread_lock: |
1846 | clear_opt(sb, DIOREAD_NOLOCK); | 1871 | clear_opt(sb, DIOREAD_NOLOCK); |
1847 | break; | 1872 | break; |
1848 | case Opt_init_inode_table: | 1873 | case Opt_init_itable: |
1849 | set_opt(sb, INIT_INODE_TABLE); | 1874 | set_opt(sb, INIT_INODE_TABLE); |
1850 | if (args[0].from) { | 1875 | if (args[0].from) { |
1851 | if (match_int(&args[0], &option)) | 1876 | if (match_int(&args[0], &option)) |
@@ -1856,7 +1881,7 @@ set_qf_format: | |||
1856 | return 0; | 1881 | return 0; |
1857 | sbi->s_li_wait_mult = option; | 1882 | sbi->s_li_wait_mult = option; |
1858 | break; | 1883 | break; |
1859 | case Opt_noinit_inode_table: | 1884 | case Opt_noinit_itable: |
1860 | clear_opt(sb, INIT_INODE_TABLE); | 1885 | clear_opt(sb, INIT_INODE_TABLE); |
1861 | break; | 1886 | break; |
1862 | default: | 1887 | default: |
@@ -1959,32 +1984,27 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1959 | struct ext4_group_desc *gdp = NULL; | 1984 | struct ext4_group_desc *gdp = NULL; |
1960 | ext4_group_t flex_group_count; | 1985 | ext4_group_t flex_group_count; |
1961 | ext4_group_t flex_group; | 1986 | ext4_group_t flex_group; |
1962 | int groups_per_flex = 0; | 1987 | unsigned int groups_per_flex = 0; |
1963 | size_t size; | 1988 | size_t size; |
1964 | int i; | 1989 | int i; |
1965 | 1990 | ||
1966 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | 1991 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; |
1967 | groups_per_flex = 1 << sbi->s_log_groups_per_flex; | 1992 | if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) { |
1968 | |||
1969 | if (groups_per_flex < 2) { | ||
1970 | sbi->s_log_groups_per_flex = 0; | 1993 | sbi->s_log_groups_per_flex = 0; |
1971 | return 1; | 1994 | return 1; |
1972 | } | 1995 | } |
1996 | groups_per_flex = 1 << sbi->s_log_groups_per_flex; | ||
1973 | 1997 | ||
1974 | /* We allocate both existing and potentially added groups */ | 1998 | /* We allocate both existing and potentially added groups */ |
1975 | flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + | 1999 | flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + |
1976 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << | 2000 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << |
1977 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; | 2001 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; |
1978 | size = flex_group_count * sizeof(struct flex_groups); | 2002 | size = flex_group_count * sizeof(struct flex_groups); |
1979 | sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); | 2003 | sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); |
1980 | if (sbi->s_flex_groups == NULL) { | 2004 | if (sbi->s_flex_groups == NULL) { |
1981 | sbi->s_flex_groups = vzalloc(size); | 2005 | ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", |
1982 | if (sbi->s_flex_groups == NULL) { | 2006 | flex_group_count); |
1983 | ext4_msg(sb, KERN_ERR, | 2007 | goto failed; |
1984 | "not enough memory for %u flex groups", | ||
1985 | flex_group_count); | ||
1986 | goto failed; | ||
1987 | } | ||
1988 | } | 2008 | } |
1989 | 2009 | ||
1990 | for (i = 0; i < sbi->s_groups_count; i++) { | 2010 | for (i = 0; i < sbi->s_groups_count; i++) { |
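The flex-group fix validates s_log_groups_per_flex before using it as a shift count: an on-disk value of 32 or more would make 1 << n undefined for a 32-bit int, so anything outside 1..31 now simply disables flex groups. The guard in isolation:

#include <stdio.h>

/* Groups per flex group, or 0 if the superblock field is unusable;
 * mirrors the 1..31 range check added above. */
static unsigned int groups_per_flex(int log_gpf)
{
	if (log_gpf < 1 || log_gpf > 31)
		return 0;		/* flex_bg effectively disabled */
	return 1U << log_gpf;
}

int main(void)
{
	printf("%u\n", groups_per_flex(4));	/* 16 */
	printf("%u\n", groups_per_flex(36));	/* 0: corrupt field */
	printf("%u\n", groups_per_flex(0));	/* 0: flex_bg not in use */
	return 0;
}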
@@ -2383,17 +2403,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | |||
2383 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); | 2403 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); |
2384 | unsigned long stripe_width = | 2404 | unsigned long stripe_width = |
2385 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); | 2405 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); |
2406 | int ret; | ||
2386 | 2407 | ||
2387 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) | 2408 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) |
2388 | return sbi->s_stripe; | 2409 | ret = sbi->s_stripe; |
2389 | 2410 | else if (stripe_width <= sbi->s_blocks_per_group) | |
2390 | if (stripe_width <= sbi->s_blocks_per_group) | 2411 | ret = stripe_width; |
2391 | return stripe_width; | 2412 | else if (stride <= sbi->s_blocks_per_group) |
2413 | ret = stride; | ||
2414 | else | ||
2415 | ret = 0; | ||
2392 | 2416 | ||
2393 | if (stride <= sbi->s_blocks_per_group) | 2417 | /* |
2394 | return stride; | 2418 | * If the stripe width is 1, this makes no sense and |
2419 | * we set it to 0 to turn off stripe handling code. | ||
2420 | */ | ||
2421 | if (ret <= 1) | ||
2422 | ret = 0; | ||
2395 | 2423 | ||
2396 | return 0; | 2424 | return ret; |
2397 | } | 2425 | } |
2398 | 2426 | ||
2399 | /* sysfs support */ | 2427 | /* sysfs support */
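The rewritten ext4_get_stripe_size() funnels all candidates through one exit so the new ret <= 1 check can zero out a degenerate one-block stripe, which would otherwise only add overhead in the allocator. The selection order as a standalone function:

#include <stdio.h>

/* Mount option first, then RAID stripe width, then RAID stride; a value
 * that doesn't fit in a group, or a 1-block "stripe", disables striping. */
static unsigned long get_stripe_size(unsigned long s_stripe,
				     unsigned long stripe_width,
				     unsigned long stride,
				     unsigned long blocks_per_group)
{
	unsigned long ret;

	if (s_stripe && s_stripe <= blocks_per_group)
		ret = s_stripe;
	else if (stripe_width <= blocks_per_group)
		ret = stripe_width;
	else if (stride <= blocks_per_group)
		ret = stride;
	else
		ret = 0;

	if (ret <= 1)
		ret = 0;
	return ret;
}

int main(void)
{
	printf("%lu\n", get_stripe_size(0, 1, 0, 32768));	/* 0, not 1 */
	printf("%lu\n", get_stripe_size(0, 256, 64, 32768));	/* 256 */
	return 0;
}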
@@ -3315,8 +3343,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3315 | sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); | 3343 | sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); |
3316 | if (sbi->s_inodes_per_block == 0) | 3344 | if (sbi->s_inodes_per_block == 0) |
3317 | goto cantfind_ext4; | 3345 | goto cantfind_ext4; |
3318 | sbi->s_itb_per_group = sbi->s_inodes_per_group / | 3346 | sbi->s_itb_per_group = DIV_ROUND_UP(sbi->s_inodes_per_group, |
3319 | sbi->s_inodes_per_block; | 3347 | sbi->s_inodes_per_block); |
3320 | sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); | 3348 | sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); |
3321 | sbi->s_sbh = bh; | 3349 | sbi->s_sbh = bh; |
3322 | sbi->s_mount_state = le16_to_cpu(es->s_state); | 3350 | sbi->s_mount_state = le16_to_cpu(es->s_state); |
@@ -3408,8 +3436,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3408 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | 3436 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); |
3409 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | 3437 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
3410 | EXT4_DESC_PER_BLOCK(sb); | 3438 | EXT4_DESC_PER_BLOCK(sb); |
3411 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), | 3439 | sbi->s_group_desc = ext4_kvmalloc(db_count * |
3412 | GFP_KERNEL); | 3440 | sizeof(struct buffer_head *), |
3441 | GFP_KERNEL); | ||
3413 | if (sbi->s_group_desc == NULL) { | 3442 | if (sbi->s_group_desc == NULL) { |
3414 | ext4_msg(sb, KERN_ERR, "not enough memory"); | 3443 | ext4_msg(sb, KERN_ERR, "not enough memory"); |
3415 | goto failed_mount; | 3444 | goto failed_mount; |
@@ -3491,7 +3520,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3491 | 3520 | ||
3492 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ | 3521 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ |
3493 | mutex_init(&sbi->s_orphan_lock); | 3522 | mutex_init(&sbi->s_orphan_lock); |
3494 | mutex_init(&sbi->s_resize_lock); | 3523 | sbi->s_resize_flags = 0; |
3495 | 3524 | ||
3496 | sb->s_root = NULL; | 3525 | sb->s_root = NULL; |
3497 | 3526 | ||
@@ -3741,12 +3770,8 @@ failed_mount_wq: | |||
3741 | } | 3770 | } |
3742 | failed_mount3: | 3771 | failed_mount3: |
3743 | del_timer(&sbi->s_err_report); | 3772 | del_timer(&sbi->s_err_report); |
3744 | if (sbi->s_flex_groups) { | 3773 | if (sbi->s_flex_groups) |
3745 | if (is_vmalloc_addr(sbi->s_flex_groups)) | 3774 | ext4_kvfree(sbi->s_flex_groups); |
3746 | vfree(sbi->s_flex_groups); | ||
3747 | else | ||
3748 | kfree(sbi->s_flex_groups); | ||
3749 | } | ||
3750 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 3775 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
3751 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 3776 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
3752 | percpu_counter_destroy(&sbi->s_dirs_counter); | 3777 | percpu_counter_destroy(&sbi->s_dirs_counter); |
@@ -3756,7 +3781,7 @@ failed_mount3: | |||
3756 | failed_mount2: | 3781 | failed_mount2: |
3757 | for (i = 0; i < db_count; i++) | 3782 | for (i = 0; i < db_count; i++) |
3758 | brelse(sbi->s_group_desc[i]); | 3783 | brelse(sbi->s_group_desc[i]); |
3759 | kfree(sbi->s_group_desc); | 3784 | ext4_kvfree(sbi->s_group_desc); |
3760 | failed_mount: | 3785 | failed_mount: |
3761 | if (sbi->s_proc) { | 3786 | if (sbi->s_proc) { |
3762 | remove_proc_entry(sb->s_id, ext4_proc_root); | 3787 | remove_proc_entry(sb->s_id, ext4_proc_root); |
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 00000000000..011ba6670d9 --- /dev/null +++ b/fs/ext4/truncate.h | |||
@@ -0,0 +1,43 @@ | |||
1 | /* | ||
2 | * linux/fs/ext4/truncate.h | ||
3 | * | ||
4 | * Common inline functions needed for truncate support | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * Truncate blocks that were not used by write. We have to truncate the | ||
9 | * pagecache as well so that corresponding buffers get properly unmapped. | ||
10 | */ | ||
11 | static inline void ext4_truncate_failed_write(struct inode *inode) | ||
12 | { | ||
13 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
14 | ext4_truncate(inode); | ||
15 | } | ||
16 | |||
17 | /* | ||
18 | * Work out how many blocks we need to proceed with the next chunk of a | ||
19 | * truncate transaction. | ||
20 | */ | ||
21 | static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) | ||
22 | { | ||
23 | ext4_lblk_t needed; | ||
24 | |||
25 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | ||
26 | |||
27 | /* Give ourselves just enough room to cope with inodes in which | ||
28 | * i_blocks is corrupt: we've seen disk corruptions in the past | ||
29 | * which resulted in random data in an inode which looked enough | ||
30 | * like a regular file for ext4 to try to delete it. Things | ||
31 | * will go a bit crazy if that happens, but at least we should | ||
32 | * try not to panic the whole kernel. */ | ||
33 | if (needed < 2) | ||
34 | needed = 2; | ||
35 | |||
36 | /* But we need to bound the transaction so we don't overflow the | ||
37 | * journal. */ | ||
38 | if (needed > EXT4_MAX_TRANS_DATA) | ||
39 | needed = EXT4_MAX_TRANS_DATA; | ||
40 | |||
41 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; | ||
42 | } | ||
43 | |||
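ext4_blocks_for_truncate() sizes the next chunk of a truncate transaction from i_blocks, which counts 512-byte sectors: shifting right by (blocksize_bits - 9) converts sectors to filesystem blocks, and the result is clamped to [2, EXT4_MAX_TRANS_DATA] before the base transaction cost is added. The arithmetic with illustrative constants (the real values come from ext4_jbd2.h):

#include <stdio.h>

#define MAX_TRANS_DATA    64	/* stands in for EXT4_MAX_TRANS_DATA */
#define DATA_TRANS_BLOCKS 24	/* stands in for EXT4_DATA_TRANS_BLOCKS() */

static unsigned long blocks_for_truncate(unsigned long long i_blocks,
					 unsigned int blocksize_bits)
{
	/* i_blocks is in 512-byte units; convert to fs blocks */
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < 2)			/* guard against corrupt i_blocks */
		needed = 2;
	if (needed > MAX_TRANS_DATA)	/* bound the transaction */
		needed = MAX_TRANS_DATA;
	return DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
	/* 1 MiB file, 4K blocks: 2048 sectors -> 256 blocks -> clamped */
	printf("%lu\n", blocks_for_truncate(2048, 12));	/* 24 + 64 = 88 */
	return 0;
}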
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index c757adc9725..19fe4e3d39e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -820,8 +820,14 @@ inserted: | |||
820 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 820 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
821 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | 821 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; |
822 | 822 | ||
823 | /* | ||
824 | * take i_data_sem because we will test | ||
825 | * i_delalloc_reserved_flag in ext4_mb_new_blocks | ||
826 | */ | ||
827 | down_read((&EXT4_I(inode)->i_data_sem)); | ||
823 | block = ext4_new_meta_blocks(handle, inode, goal, 0, | 828 | block = ext4_new_meta_blocks(handle, inode, goal, 0, |
824 | NULL, &error); | 829 | NULL, &error); |
830 | up_read((&EXT4_I(inode)->i_data_sem)); | ||
825 | if (error) | 831 | if (error) |
826 | goto cleanup; | 832 | goto cleanup; |
827 | 833 | ||