Diffstat (limited to 'fs/ext4')
-rw-r--r--   fs/ext4/Makefile             2
-rw-r--r--   fs/ext4/acl.c                9
-rw-r--r--   fs/ext4/balloc.c            48
-rw-r--r--   fs/ext4/block_validity.c    21
-rw-r--r--   fs/ext4/ext4.h              55
-rw-r--r--   fs/ext4/extents.c          129
-rw-r--r--   fs/ext4/fsync.c             26
-rw-r--r--   fs/ext4/ialloc.c             2
-rw-r--r--   fs/ext4/indirect.c        1482
-rw-r--r--   fs/ext4/inode.c           1596
-rw-r--r--   fs/ext4/ioctl.c             12
-rw-r--r--   fs/ext4/mballoc.c          230
-rw-r--r--   fs/ext4/mballoc.h            1
-rw-r--r--   fs/ext4/namei.c             21
-rw-r--r--   fs/ext4/page-io.c            6
-rw-r--r--   fs/ext4/resize.c           199
-rw-r--r--   fs/ext4/super.c             88
-rw-r--r--   fs/ext4/truncate.h          43
18 files changed, 2090 insertions, 1880 deletions
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 04109460ba9e..56fd8f865930 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
-		mmp.o
+		mmp.o indirect.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index dca2d1ded931..a5c29bb3b835 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	case ACL_TYPE_ACCESS:
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl) {
-			mode_t mode = inode->i_mode;
-			error = posix_acl_equiv_mode(acl, &mode);
+			error = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (error < 0)
 				return error;
 			else {
-				inode->i_mode = mode;
 				inode->i_ctime = ext4_current_time(inode);
 				ext4_mark_inode_dirty(handle, inode);
 				if (error == 0)
@@ -259,19 +257,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
 		inode->i_mode &= ~current_umask();
 	}
 	if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
-		mode_t mode = inode->i_mode;
-
 		if (S_ISDIR(inode->i_mode)) {
 			error = ext4_set_acl(handle, inode,
 					     ACL_TYPE_DEFAULT, acl);
 			if (error)
 				goto cleanup;
 		}
-		error = posix_acl_create(&acl, GFP_NOFS, &mode);
+		error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
 		if (error < 0)
 			return error;
 
-		inode->i_mode = mode;
 		if (error > 0) {
 			/* This is an extended ACL */
 			error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 264f6949511e..f8224adf496e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
 
 }
 
+/**
+ * ext4_inode_to_goal_block - return a hint for block allocation
+ * @inode: inode for block allocation
+ *
+ * Return the ideal location to start allocating blocks for a
+ * newly created inode.
+ */
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_group_t block_group;
+	ext4_grpblk_t colour;
+	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
+	ext4_fsblk_t bg_start;
+	ext4_fsblk_t last_block;
+
+	block_group = ei->i_block_group;
+	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+		/*
+		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+		 * block groups per flexgroup, reserve the first block
+		 * group for directories and special files.  Regular
+		 * files will start at the second block group.  This
+		 * tends to speed up directory access and improves
+		 * fsck times.
+		 */
+		block_group &= ~(flex_size-1);
+		if (S_ISREG(inode->i_mode))
+			block_group++;
+	}
+	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
+	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+
+	/*
+	 * If we are doing delayed allocation, we don't need take
+	 * colour into account.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC))
+		return bg_start;
+
+	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
+		colour = (current->pid % 16) *
+			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
+	else
+		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
+	return bg_start + colour;
+}
+
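Note on the hunk above: the "colour" arithmetic spreads concurrent allocators across sixteen starting offsets inside a block group, keyed by PID, so unrelated processes filling the same group do not fight over the same free-block run. A minimal userspace sketch of that arithmetic follows; goal_colour(), pid and blocks_per_group are illustrative stand-ins, not kernel API:

    #include <stdio.h>

    /* slot = pid % 16; slots start blocks_per_group/16 blocks apart */
    static unsigned long goal_colour(unsigned long pid,
                                     unsigned long blocks_per_group)
    {
            return (pid % 16) * (blocks_per_group / 16);
    }

    int main(void)
    {
            printf("%lu\n", goal_colour(1234, 32768)); /* slot 2 -> 4096  */
            printf("%lu\n", goal_colour(1237, 32768)); /* slot 5 -> 10240 */
            return 0;
    }

Under delayed allocation the hint skips the colour entirely, since the allocator will choose the real starting block later with better information.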
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index fac90f3fba80..8efb2f0a3447 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
 	return 1;
 }
 
+int ext4_check_blockref(const char *function, unsigned int line,
+			struct inode *inode, __le32 *p, unsigned int max)
+{
+	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	__le32 *bref = p;
+	unsigned int blk;
+
+	while (bref < p+max) {
+		blk = le32_to_cpu(*bref++);
+		if (blk &&
+		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
+						    blk, 1))) {
+			es->s_last_error_block = cpu_to_le64(blk);
+			ext4_error_inode(inode, function, line, blk,
+					 "invalid block");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
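The helper moved here is the reference-array walk that the two new ext4.h macros (further down in this diff) wrap: skip holes (a zero entry), reject the first entry that points outside the valid data region. A self-contained userspace sketch of the same walk, with valid_block() standing in for ext4_data_block_valid():

    #include <stdint.h>
    #include <stdio.h>

    /* stand-in for ext4_data_block_valid(): blk must fall inside the fs */
    static int valid_block(uint32_t blk, uint32_t nblocks)
    {
            return blk >= 1 && blk < nblocks;
    }

    static int check_blockrefs(const uint32_t *p, unsigned int max,
                               uint32_t nblocks)
    {
            const uint32_t *bref = p;

            while (bref < p + max) {
                    uint32_t blk = *bref++;

                    if (blk && !valid_block(blk, nblocks))
                            return -1;      /* corrupt reference */
            }
            return 0;
    }

    int main(void)
    {
            uint32_t refs[4] = { 100, 0, 250, 999999 };

            printf("%d\n", check_blockrefs(refs, 4, 1000)); /* prints -1 */
            return 0;
    }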
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fa44df879711..e717dfd2f2b4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -526,6 +526,7 @@ struct ext4_new_group_data {
 #define EXT4_FREE_BLOCKS_METADATA	0x0001
 #define EXT4_FREE_BLOCKS_FORGET		0x0002
 #define EXT4_FREE_BLOCKS_VALIDATED	0x0004
+#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE	0x0008
 
 /*
  * ioctl commands
@@ -939,6 +940,8 @@ struct ext4_inode_info {
 #define ext4_find_next_zero_bit		find_next_zero_bit_le
 #define ext4_find_next_bit		find_next_bit_le
 
+extern void ext4_set_bits(void *bm, int cur, int len);
+
 /*
  * Maximal mount counts between two filesystem checks
  */
@@ -1126,7 +1129,8 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
-	struct mutex s_resize_lock;
+	unsigned long s_resize_flags;		/* Flags indicating if there
+						   is a resizer */
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
@@ -1214,6 +1218,9 @@ struct ext4_sb_info {
 
 	/* Kernel thread for multiple mount protection */
 	struct task_struct *s_mmp_tsk;
+
+	/* record the last minlen when FITRIM is called. */
+	atomic_t s_last_trim_minblks;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1743,6 +1750,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
 					struct ext4_group_desc *desc);
 #define ext4_free_blocks_after_init(sb, group, desc)			\
 	ext4_init_block_bitmap(sb, NULL, group, desc)
+ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
 
 /* dir.c */
 extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -1793,7 +1801,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			     unsigned long count, int flags);
 extern int ext4_mb_add_groupinfo(struct super_block *sb,
 		ext4_group_t i, struct ext4_group_desc *desc);
-extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
+extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 				ext4_fsblk_t block, unsigned long count);
 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
 
@@ -1834,6 +1842,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
+
+/* indirect.c */
+extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
+			       struct ext4_map_blocks *map, int flags);
+extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
+				  const struct iovec *iov, loff_t offset,
+				  unsigned long nr_segs);
+extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern void ext4_ind_truncate(struct inode *inode);
+
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
 extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1855,6 +1874,9 @@ extern int ext4_group_extend(struct super_block *sb,
 				ext4_fsblk_t n_blocks_count);
 
 /* super.c */
+extern void *ext4_kvmalloc(size_t size, gfp_t flags);
+extern void *ext4_kvzalloc(size_t size, gfp_t flags);
+extern void ext4_kvfree(void *ptr);
 extern void __ext4_error(struct super_block *, const char *, unsigned int,
 			 const char *, ...)
 	__attribute__ ((format (printf, 4, 5)));
@@ -2067,11 +2089,19 @@ struct ext4_group_info {
 					 * 5 free 8-block regions. */
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT		0
+#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT		1
 
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 
+#define EXT4_MB_GRP_WAS_TRIMMED(grp)	\
+	(test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_SET_TRIMMED(grp)	\
+	(set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
+	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
+
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
 
@@ -2123,6 +2153,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb)
 }
 
 /*
+ * Block validity checking
+ */
+#define ext4_check_indirect_blockref(inode, bh)				\
+	ext4_check_blockref(__func__, __LINE__, inode,			\
+			    (__le32 *)(bh)->b_data,			\
+			    EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_ind_check_inode(inode)					\
+	ext4_check_blockref(__func__, __LINE__, inode,			\
+			    EXT4_I(inode)->i_data,			\
+			    EXT4_NDIR_BLOCKS)
+
+/*
  * Inodes and files operations
  */
 
@@ -2151,6 +2194,8 @@ extern void ext4_exit_system_zone(void);
 extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
 				 ext4_fsblk_t start_blk,
 				 unsigned int count);
+extern int ext4_check_blockref(const char *, unsigned int,
+			       struct inode *, __le32 *, unsigned int);
 
 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
@@ -2230,6 +2275,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
 extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
 
+#define EXT4_RESIZING	0
+extern int ext4_resize_begin(struct super_block *sb);
+extern void ext4_resize_end(struct super_block *sb);
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
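The new ext4_kvmalloc()/ext4_kvzalloc()/ext4_kvfree() declarations above are backed by super.c, whose hunks fall outside this fs/ext4 excerpt. The pattern they name is the usual try-kmalloc-then-fall-back-to-vmalloc dance for allocations (such as resize-time tables) that may be too big for physically contiguous pages. A sketch of that pattern, assuming the 3.x-era __vmalloc() signature; treat it as an illustration, not the committed implementation:

    void *ext4_kvmalloc(size_t size, gfp_t flags)
    {
    	void *ret;

    	ret = kmalloc(size, flags);		/* fast path, contiguous */
    	if (!ret)
    		ret = __vmalloc(size, flags, PAGE_KERNEL); /* large case */
    	return ret;
    }

    void ext4_kvfree(void *ptr)
    {
    	if (is_vmalloc_addr(ptr))		/* free with the matching API */
    		vfree(ptr);
    	else
    		kfree(ptr);
    }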
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f815cc81e7a2..57cf568a98ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 			      struct ext4_ext_path *path,
 			      ext4_lblk_t block)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	ext4_fsblk_t bg_start;
-	ext4_fsblk_t last_block;
-	ext4_grpblk_t colour;
-	ext4_group_t block_group;
-	int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
 	int depth;
 
 	if (path) {
@@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
 	}
 
 	/* OK. use inode's group */
-	block_group = ei->i_block_group;
-	if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
-		/*
-		 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
-		 * block groups per flexgroup, reserve the first block
-		 * group for directories and special files.  Regular
-		 * files will start at the second block group.  This
-		 * tends to speed up directory access and improves
-		 * fsck times.
-		 */
-		block_group &= ~(flex_size-1);
-		if (S_ISREG(inode->i_mode))
-			block_group++;
-	}
-	bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
-	last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
-
-	/*
-	 * If we are doing delayed allocation, we don't need take
-	 * colour into account.
-	 */
-	if (test_opt(inode->i_sb, DELALLOC))
-		return bg_start;
-
-	if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
-		colour = (current->pid % 16) *
-			(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
-	else
-		colour = (current->pid % 16) * ((last_block - bg_start) / 16);
-	return bg_start + colour + block;
+	return ext4_inode_to_goal_block(inode);
 }
 
 /*
@@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 			   logical, le32_to_cpu(curp->p_idx->ei_block));
 		return -EIO;
 	}
+
+	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
+			     >= le16_to_cpu(curp->p_hdr->eh_max))) {
+		EXT4_ERROR_INODE(inode,
+				 "eh_entries %d >= eh_max %d!",
+				 le16_to_cpu(curp->p_hdr->eh_entries),
+				 le16_to_cpu(curp->p_hdr->eh_max));
+		return -EIO;
+	}
+
 	len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
 	if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
 		/* insert after */
@@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 	ext4_idx_store_pblock(ix, ptr);
 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
 
-	if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
-			     > le16_to_cpu(curp->p_hdr->eh_max))) {
-		EXT4_ERROR_INODE(inode,
-				 "logical %d == ei_block %d!",
-				 logical, le32_to_cpu(curp->p_idx->ei_block));
-		return -EIO;
-	}
 	if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
 		EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
 		return -EIO;
@@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
  * ext4_ext_next_leaf_block:
  * returns first allocated block from next leaf or EXT_MAX_BLOCKS
  */
-static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode,
-					struct ext4_ext_path *path)
+static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
 {
 	int depth;
 
@@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
 		goto merge;
 	}
 
-repeat:
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 	if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
@@ -1765,9 +1731,10 @@ repeat:
 
 	/* probably next leaf has space for us? */
 	fex = EXT_LAST_EXTENT(eh);
-	next = ext4_ext_next_leaf_block(inode, path);
-	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)
-	    && next != EXT_MAX_BLOCKS) {
+	next = EXT_MAX_BLOCKS;
+	if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
+		next = ext4_ext_next_leaf_block(path);
+	if (next != EXT_MAX_BLOCKS) {
 		ext_debug("next leaf block - %d\n", next);
 		BUG_ON(npath != NULL);
 		npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1779,7 +1746,7 @@ repeat:
 			ext_debug("next leaf isn't full(%d)\n",
 				  le16_to_cpu(eh->eh_entries));
 			path = npath;
-			goto repeat;
+			goto has_space;
 		}
 		ext_debug("next leaf has no free space(%d,%d)\n",
 			  le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -1839,7 +1806,7 @@ has_space:
 				ext4_ext_pblock(newext),
 				ext4_ext_is_uninitialized(newext),
 				ext4_ext_get_actual_len(newext),
-				nearex, len, nearex + 1, nearex + 2);
+				nearex, len, nearex, nearex + 1);
 		memmove(nearex + 1, nearex, len);
 		path[depth].p_ext = nearex;
 	}
@@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
 }
 
 /*
- * ext4_ext_in_cache()
+ * ext4_ext_check_cache()
  * Checks to see if the given block is in the cache.
  * If it is, the cached extent is stored in the given
  * cache extent pointer.  If the cached extent is a hole,
@@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
 /*
  * ext4_ext_rm_idx:
  * removes index from the index block.
- * It's used in truncate case only, thus all requests are for
- * last index in the block only.
  */
 static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 			struct ext4_ext_path *path)
@@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
 	err = ext4_ext_get_access(handle, inode, path);
 	if (err)
 		return err;
+
+	if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
+		int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
+		len *= sizeof(struct ext4_extent_idx);
+		memmove(path->p_idx, path->p_idx + 1, len);
+	}
+
 	le16_add_cpu(&path->p_hdr->eh_entries, -1);
 	err = ext4_ext_dirty(handle, inode, path);
 	if (err)
@@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
-				ext4_lblk_t end)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2575,7 +2546,7 @@ again:
 		if (i == depth) {
 			/* this is leaf block */
 			err = ext4_ext_rm_leaf(handle, inode, path,
-					start, end);
+					start, EXT_MAX_BLOCKS - 1);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
 					struct ext4_ext_path *path)
 {
 	struct ext4_extent *ex;
-	struct ext4_extent_header *eh;
 	int depth;
 	int err = 0;
 
 	depth = ext_depth(inode);
-	eh = path[depth].p_hdr;
 	ex = path[depth].p_ext;
 
 	ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
@@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
-	if (ext4_ext_in_cache(inode, map->m_lblk, &newex) &&
-		((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) {
+	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
+		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
 			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				/*
@@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 			ext4_ext_mark_uninitialized(ex);
 
-			err = ext4_ext_remove_space(inode, map->m_lblk,
-				map->m_lblk + punched_out);
+			ext4_ext_invalidate_cache(inode);
+
+			err = ext4_ext_rm_leaf(handle, inode, path,
+				map->m_lblk, map->m_lblk + punched_out);
+
+			if (!err && path->p_hdr->eh_entries == 0) {
+				/*
+				 * Punch hole freed all of this sub tree,
+				 * so we need to correct eh_depth
+				 */
+				err = ext4_ext_get_access(handle, inode, path);
+				if (err == 0) {
+					ext_inode_hdr(inode)->eh_depth = 0;
+					ext_inode_hdr(inode)->eh_max =
+						cpu_to_le16(ext4_ext_space_root(
+							inode, 0));
+
+					err = ext4_ext_dirty(
+						handle, inode, path);
+				}
+			}
 
 			goto out2;
 		}
@@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
-	if (err)
-		goto out2;
-
-	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
+	if (!err)
+		err = ext4_ext_insert_extent(handle, inode, path,
+					     &newex, flags);
 	if (err) {
+		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
+			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
 		/* free data blocks we just allocated */
 		/* not a good idea to call discard here directly,
 		 * but otherwise we'd need to call it every free() */
 		ext4_discard_preallocations(inode);
 		ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
-				 ext4_ext_get_actual_len(&newex), 0);
+				 ext4_ext_get_actual_len(&newex), fb_flags);
 		goto out2;
 	}
 
@@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode)
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
-	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
+	err = ext4_ext_remove_space(inode, last_block);
 
 	/* In a multi-transaction truncate, we only make the final
 	 * transaction synchronous.
@@ -3835,7 +3824,7 @@ retry:
 				    blkbits) >> blkbits))
 			new_size = offset + len;
 		else
-			new_size = (map.m_lblk + ret) << blkbits;
+			new_size = ((loff_t) map.m_lblk + ret) << blkbits;
 
 		ext4_falloc_update_inode(inode, mode, new_size,
 					 (map.m_flags & EXT4_MAP_NEW));
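Two of the extents.c hunks are easy to misread. First, ext4_ext_rm_idx() can now drop an interior index, not just the last one: the new memmove slides every later index down one slot before eh_entries is decremented. A toy version of that removal, with the struct trimmed down for illustration:

    #include <string.h>

    struct idx { unsigned int ei_block; unsigned long leaf; };

    /* remove entry 'victim' from a packed array of *nentries indexes */
    static void rm_idx(struct idx *base, int *nentries, int victim)
    {
    	int tail = *nentries - victim - 1;	/* entries after victim */

    	if (tail > 0)
    		memmove(&base[victim], &base[victim + 1],
    			tail * sizeof(struct idx));
    	(*nentries)--;
    }

Second, the fallocate fix at the end matters on 32-bit builds: (map.m_lblk + ret) << blkbits is evaluated in 32 bits and overflows for offsets past 4 GiB, so the sum is widened to loff_t before the shift.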
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index da3bed3e0c29..036f78f7a1ef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode)
 {
 	struct writeback_control wbc;
 	struct dentry *dentry = NULL;
+	struct inode *next;
 	int ret = 0;
 
-	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+	if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
+		return 0;
+	inode = igrab(inode);
+	while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
 		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
-		dentry = list_entry(inode->i_dentry.next,
-				    struct dentry, d_alias);
-		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
+		dentry = NULL;
+		spin_lock(&inode->i_lock);
+		if (!list_empty(&inode->i_dentry)) {
+			dentry = list_first_entry(&inode->i_dentry,
+						  struct dentry, d_alias);
+			dget(dentry);
+		}
+		spin_unlock(&inode->i_lock);
+		if (!dentry)
 			break;
-		inode = dentry->d_parent->d_inode;
+		next = igrab(dentry->d_parent->d_inode);
+		dput(dentry);
+		if (!next)
+			break;
+		iput(inode);
+		inode = next;
 		ret = sync_mapping_buffers(inode->i_mapping);
 		if (ret)
 			break;
@@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode)
 		if (ret)
 			break;
 	}
+	iput(inode);
 	return ret;
 }
 
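The reworked ext4_sync_parent() above is a reference-counting fix: the old loop walked d_parent chains while holding no references at all. The new loop pins the current inode with igrab(), takes the first alias under i_lock with dget(), pins the parent before dropping anything, and finishes with iput(). Reduced to its shape, with node_get()/node_put() as hypothetical stand-ins for igrab()/iput():

    struct node { struct node *parent; };

    struct node *node_get(struct node *n);	/* hypothetical: take a ref */
    void node_put(struct node *n);		/* hypothetical: drop a ref */

    static void walk_to_root(struct node *n)
    {
    	n = node_get(n);			/* pin the starting point */
    	while (n) {
    		struct node *next =
    			n->parent ? node_get(n->parent) : NULL;

    		node_put(n);	/* drop current only after next is pinned */
    		n = next;
    	}
    }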
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21bb2f61e502..9c63f273b550 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 			   group, used_blks,
 			   ext4_itable_unused_count(sb, gdp));
 		ret = 1;
-		goto out;
+		goto err_out;
 	}
 
 	blk = ext4_inode_table(sb, gdp) + used_blks;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
new file mode 100644
index 000000000000..b8602cde5b5a
--- /dev/null
+++ b/fs/ext4/indirect.c
@@ -0,0 +1,1482 @@
+/*
+ *  linux/fs/ext4/indirect.c
+ *
+ *  from
+ *
+ *  linux/fs/ext4/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Goal-directed block allocation by Stephen Tweedie
+ *	(sct@redhat.com), 1993, 1998
+ */
+
+#include <linux/module.h>
+#include "ext4_jbd2.h"
+#include "truncate.h"
+
+#include <trace/events/ext4.h>
+
+typedef struct {
+	__le32	*p;
+	__le32	key;
+	struct buffer_head *bh;
+} Indirect;
+
+static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
+{
+	p->key = *(p->p = v);
+	p->bh = bh;
+}
+
+/**
+ * ext4_block_to_path - parse the block number into array of offsets
+ * @inode: inode in question (we are only interested in its superblock)
+ * @i_block: block number to be parsed
+ * @offsets: array to store the offsets in
+ * @boundary: set this non-zero if the referred-to block is likely to be
+ *	followed (on disk) by an indirect block.
+ *
+ * To store the locations of file's data ext4 uses a data structure common
+ * for UNIX filesystems - tree of pointers anchored in the inode, with
+ * data blocks at leaves and indirect blocks in intermediate nodes.
+ * This function translates the block number into path in that tree -
+ * return value is the path length and @offsets[n] is the offset of
+ * pointer to (n+1)th node in the nth one. If @block is out of range
+ * (negative or too large) warning is printed and zero returned.
+ *
+ * Note: function doesn't find node addresses, so no IO is needed. All
+ * we need to know is the capacity of indirect blocks (taken from the
+ * inode->i_sb).
+ */
+
+/*
+ * Portability note: the last comparison (check that we fit into triple
+ * indirect block) is spelled differently, because otherwise on an
+ * architecture with 32-bit longs and 8Kb pages we might get into trouble
+ * if our filesystem had 8Kb blocks. We might use long long, but that would
+ * kill us on x86. Oh, well, at least the sign propagation does not matter -
+ * i_block would have to be negative in the very beginning, so we would not
+ * get there at all.
+ */
+
+static int ext4_block_to_path(struct inode *inode,
+			      ext4_lblk_t i_block,
+			      ext4_lblk_t offsets[4], int *boundary)
+{
+	int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+	int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
+	const long direct_blocks = EXT4_NDIR_BLOCKS,
+		indirect_blocks = ptrs,
+		double_blocks = (1 << (ptrs_bits * 2));
+	int n = 0;
+	int final = 0;
+
+	if (i_block < direct_blocks) {
+		offsets[n++] = i_block;
+		final = direct_blocks;
+	} else if ((i_block -= direct_blocks) < indirect_blocks) {
+		offsets[n++] = EXT4_IND_BLOCK;
+		offsets[n++] = i_block;
+		final = ptrs;
+	} else if ((i_block -= indirect_blocks) < double_blocks) {
+		offsets[n++] = EXT4_DIND_BLOCK;
+		offsets[n++] = i_block >> ptrs_bits;
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+		offsets[n++] = EXT4_TIND_BLOCK;
+		offsets[n++] = i_block >> (ptrs_bits * 2);
+		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+		offsets[n++] = i_block & (ptrs - 1);
+		final = ptrs;
+	} else {
+		ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
+			     i_block + direct_blocks +
+			     indirect_blocks + double_blocks, inode->i_ino);
+	}
+	if (boundary)
+		*boundary = final - 1 - (i_block & (ptrs - 1));
+	return n;
+}
+
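A worked example of the path computation above, assuming 4 KiB blocks so that ptrs = 1024 pointers fit in one indirect block:

    /*
     *   i_block = 5000
     *   5000 - 12 (direct slots)      = 4988  -> too big for single indirect
     *   4988 - 1024 (single indirect) = 3964  -> fits under 1024^2, double
     *
     *   offsets[] = { EXT4_DIND_BLOCK, 3964 >> 10 = 3, 3964 & 1023 = 892 }
     *
     * i.e. follow the double-indirect pointer, take entry 3 there, then
     * entry 892 in that indirect block, to reach logical block 5000.
     */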
+/**
+ * ext4_get_branch - read the chain of indirect blocks leading to data
+ * @inode: inode in question
+ * @depth: depth of the chain (1 - direct pointer, etc.)
+ * @offsets: offsets of pointers in inode/indirect blocks
+ * @chain: place to store the result
+ * @err: here we store the error value
+ *
+ * Function fills the array of triples <key, p, bh> and returns %NULL
+ * if everything went OK or the pointer to the last filled triple
+ * (incomplete one) otherwise. Upon the return chain[i].key contains
+ * the number of (i+1)-th block in the chain (as it is stored in memory,
+ * i.e. little-endian 32-bit), chain[i].p contains the address of that
+ * number (it points into struct inode for i==0 and into the bh->b_data
+ * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+ * block for i>0 and NULL for i==0. In other words, it holds the block
+ * numbers of the chain, addresses they were taken from (and where we can
+ * verify that chain did not change) and buffer_heads hosting these
+ * numbers.
+ *
+ * Function stops when it stumbles upon zero pointer (absent block)
+ *	(pointer to last triple returned, *@err == 0)
+ * or when it gets an IO error reading an indirect block
+ *	(ditto, *@err == -EIO)
+ * or when it reads all @depth-1 indirect blocks successfully and finds
+ * the whole chain, all way to the data (returns %NULL, *err == 0).
+ *
+ * Need to be called with
+ * down_read(&EXT4_I(inode)->i_data_sem)
+ */
+static Indirect *ext4_get_branch(struct inode *inode, int depth,
+				 ext4_lblk_t *offsets,
+				 Indirect chain[4], int *err)
+{
+	struct super_block *sb = inode->i_sb;
+	Indirect *p = chain;
+	struct buffer_head *bh;
+
+	*err = 0;
+	/* i_data is not going away, no lock needed */
+	add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
+	if (!p->key)
+		goto no_block;
+	while (--depth) {
+		bh = sb_getblk(sb, le32_to_cpu(p->key));
+		if (unlikely(!bh))
+			goto failure;
+
+		if (!bh_uptodate_or_lock(bh)) {
+			if (bh_submit_read(bh) < 0) {
+				put_bh(bh);
+				goto failure;
+			}
+			/* validate block references */
+			if (ext4_check_indirect_blockref(inode, bh)) {
+				put_bh(bh);
+				goto failure;
+			}
+		}
+
+		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
+		/* Reader: end */
+		if (!p->key)
+			goto no_block;
+	}
+	return NULL;
+
+failure:
+	*err = -EIO;
+no_block:
+	return p;
+}
+
+/**
+ * ext4_find_near - find a place for allocation with sufficient locality
+ * @inode: owner
+ * @ind: descriptor of indirect block.
+ *
+ * This function returns the preferred place for block allocation.
+ * It is used when heuristic for sequential allocation fails.
+ * Rules are:
+ *   + if there is a block to the left of our position - allocate near it.
+ *   + if pointer will live in indirect block - allocate near that block.
+ *   + if pointer will live in inode - allocate in the same
+ *     cylinder group.
+ *
+ * In the latter case we colour the starting block by the callers PID to
+ * prevent it from clashing with concurrent allocations for a different inode
+ * in the same block group.   The PID is used here so that functionally related
+ * files will be close-by on-disk.
+ *
+ * Caller must make sure that @ind is valid and will stay that way.
+ */
+static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
+	__le32 *p;
+
+	/* Try to find previous block */
+	for (p = ind->p - 1; p >= start; p--) {
+		if (*p)
+			return le32_to_cpu(*p);
+	}
+
+	/* No such thing, so let's try location of indirect block */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
+
+	/*
+	 * It is going to be referred to from the inode itself? OK, just put it
+	 * into the same cylinder group then.
+	 */
+	return ext4_inode_to_goal_block(inode);
+}
+
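Rule 1 of ext4_find_near() in a picture: the scan starts one slot left of where the new pointer will live and walks backwards, and the first non-empty reference wins:

    /*
     *   i_data: [ 120, 121, 0, 0, <new> ... ]
     *                           ^ scan starts left of <new>, walks back
     *
     *   -> returns 121, so the new block lands next to its nearest
     *      allocated neighbour; only if the whole row is empty do rules
     *      2 and 3 (indirect block, then the inode's goal group) apply.
     */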
+/**
+ * ext4_find_goal - find a preferred place for allocation.
+ * @inode: owner
+ * @block: block we want
+ * @partial: pointer to the last triple within a chain
+ *
+ * Normally this function find the preferred place for block allocation,
+ * returns it.
+ * Because this is only used for non-extent files, we limit the block nr
+ * to 32 bits.
+ */
+static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
+				   Indirect *partial)
+{
+	ext4_fsblk_t goal;
+
+	/*
+	 * XXX need to get goal block from mballoc's data structures
+	 */
+
+	goal = ext4_find_near(inode, partial);
+	goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
+	return goal;
+}
+
+/**
+ * ext4_blks_to_allocate - Look up the block map and count the number
+ * of direct blocks need to be allocated for the given branch.
+ *
+ * @branch: chain of indirect blocks
+ * @k: number of blocks need for indirect blocks
+ * @blks: number of data blocks to be mapped.
+ * @blocks_to_boundary: the offset in the indirect block
+ *
+ * return the total number of blocks to be allocate, including the
+ * direct and indirect blocks.
+ */
+static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
+				 int blocks_to_boundary)
+{
+	unsigned int count = 0;
+
+	/*
+	 * Simple case, [t,d]Indirect block(s) has not allocated yet
+	 * then it's clear blocks on that path have not allocated
+	 */
+	if (k > 0) {
+		/* right now we don't handle cross boundary allocation */
+		if (blks < blocks_to_boundary + 1)
+			count += blks;
+		else
+			count += blocks_to_boundary + 1;
+		return count;
+	}
+
+	count++;
+	while (count < blks && count <= blocks_to_boundary &&
+		le32_to_cpu(*(branch[0].p + count)) == 0) {
+		count++;
+	}
+	return count;
+}
+
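Numeric example for ext4_blks_to_allocate() above:

    /*
     * blks = 8, blocks_to_boundary = 5, k = 2 (indirect blocks missing).
     * Since 8 < 5 + 1 is false, the function returns
     * blocks_to_boundary + 1 = 6: an allocation never crosses an
     * indirect-block boundary in one pass, so the last two data blocks
     * are left for the next call.
     */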
291 | /** | ||
292 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch | ||
293 | * @handle: handle for this transaction | ||
294 | * @inode: inode which needs allocated blocks | ||
295 | * @iblock: the logical block to start allocated at | ||
296 | * @goal: preferred physical block of allocation | ||
297 | * @indirect_blks: the number of blocks need to allocate for indirect | ||
298 | * blocks | ||
299 | * @blks: number of desired blocks | ||
300 | * @new_blocks: on return it will store the new block numbers for | ||
301 | * the indirect blocks(if needed) and the first direct block, | ||
302 | * @err: on return it will store the error code | ||
303 | * | ||
304 | * This function will return the number of blocks allocated as | ||
305 | * requested by the passed-in parameters. | ||
306 | */ | ||
307 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
308 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
309 | int indirect_blks, int blks, | ||
310 | ext4_fsblk_t new_blocks[4], int *err) | ||
311 | { | ||
312 | struct ext4_allocation_request ar; | ||
313 | int target, i; | ||
314 | unsigned long count = 0, blk_allocated = 0; | ||
315 | int index = 0; | ||
316 | ext4_fsblk_t current_block = 0; | ||
317 | int ret = 0; | ||
318 | |||
319 | /* | ||
320 | * Here we try to allocate the requested multiple blocks at once, | ||
321 | * on a best-effort basis. | ||
322 | * To build a branch, we should allocate blocks for | ||
323 | * the indirect blocks(if not allocated yet), and at least | ||
324 | * the first direct block of this branch. That's the | ||
325 | * minimum number of blocks need to allocate(required) | ||
326 | */ | ||
327 | /* first we try to allocate the indirect blocks */ | ||
328 | target = indirect_blks; | ||
329 | while (target > 0) { | ||
330 | count = target; | ||
331 | /* allocating blocks for indirect blocks and direct blocks */ | ||
332 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
333 | 0, &count, err); | ||
334 | if (*err) | ||
335 | goto failed_out; | ||
336 | |||
337 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
338 | EXT4_ERROR_INODE(inode, | ||
339 | "current_block %llu + count %lu > %d!", | ||
340 | current_block, count, | ||
341 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
342 | *err = -EIO; | ||
343 | goto failed_out; | ||
344 | } | ||
345 | |||
346 | target -= count; | ||
347 | /* allocate blocks for indirect blocks */ | ||
348 | while (index < indirect_blks && count) { | ||
349 | new_blocks[index++] = current_block++; | ||
350 | count--; | ||
351 | } | ||
352 | if (count > 0) { | ||
353 | /* | ||
354 | * save the new block number | ||
355 | * for the first direct block | ||
356 | */ | ||
357 | new_blocks[index] = current_block; | ||
358 | printk(KERN_INFO "%s returned more blocks than " | ||
359 | "requested\n", __func__); | ||
360 | WARN_ON(1); | ||
361 | break; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | target = blks - count ; | ||
366 | blk_allocated = count; | ||
367 | if (!target) | ||
368 | goto allocated; | ||
369 | /* Now allocate data blocks */ | ||
370 | memset(&ar, 0, sizeof(ar)); | ||
371 | ar.inode = inode; | ||
372 | ar.goal = goal; | ||
373 | ar.len = target; | ||
374 | ar.logical = iblock; | ||
375 | if (S_ISREG(inode->i_mode)) | ||
376 | /* enable in-core preallocation only for regular files */ | ||
377 | ar.flags = EXT4_MB_HINT_DATA; | ||
378 | |||
379 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
380 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
381 | EXT4_ERROR_INODE(inode, | ||
382 | "current_block %llu + ar.len %d > %d!", | ||
383 | current_block, ar.len, | ||
384 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
385 | *err = -EIO; | ||
386 | goto failed_out; | ||
387 | } | ||
388 | |||
389 | if (*err && (target == blks)) { | ||
390 | /* | ||
391 | * if the allocation failed and we didn't allocate | ||
392 | * any blocks before | ||
393 | */ | ||
394 | goto failed_out; | ||
395 | } | ||
396 | if (!*err) { | ||
397 | if (target == blks) { | ||
398 | /* | ||
399 | * save the new block number | ||
400 | * for the first direct block | ||
401 | */ | ||
402 | new_blocks[index] = current_block; | ||
403 | } | ||
404 | blk_allocated += ar.len; | ||
405 | } | ||
406 | allocated: | ||
407 | /* total number of blocks allocated for direct blocks */ | ||
408 | ret = blk_allocated; | ||
409 | *err = 0; | ||
410 | return ret; | ||
411 | failed_out: | ||
412 | for (i = 0; i < index; i++) | ||
413 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
414 | return ret; | ||
415 | } | ||
416 | |||
417 | /** | ||
418 | * ext4_alloc_branch - allocate and set up a chain of blocks. | ||
419 | * @handle: handle for this transaction | ||
420 | * @inode: owner | ||
421 | * @indirect_blks: number of allocated indirect blocks | ||
422 | * @blks: number of allocated direct blocks | ||
423 | * @goal: preferred place for allocation | ||
424 | * @offsets: offsets (in the blocks) to store the pointers to next. | ||
425 | * @branch: place to store the chain in. | ||
426 | * | ||
427 | * This function allocates blocks, zeroes out all but the last one, | ||
428 | * links them into chain and (if we are synchronous) writes them to disk. | ||
429 | * In other words, it prepares a branch that can be spliced onto the | ||
430 | * inode. It stores the information about that chain in the branch[], in | ||
431 | * the same format as ext4_get_branch() would do. We are calling it after | ||
432 | * we had read the existing part of chain and partial points to the last | ||
433 | * triple of that (one with zero ->key). Upon the exit we have the same | ||
434 | * picture as after the successful ext4_get_block(), except that in one | ||
435 | * place chain is disconnected - *branch->p is still zero (we did not | ||
436 | * set the last link), but branch->key contains the number that should | ||
437 | * be placed into *branch->p to fill that gap. | ||
438 | * | ||
439 | * If allocation fails we free all blocks we've allocated (and forget | ||
440 | * their buffer_heads) and return the error value the from failed | ||
441 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | ||
442 | * as described above and return 0. | ||
443 | */ | ||
444 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | ||
445 | ext4_lblk_t iblock, int indirect_blks, | ||
446 | int *blks, ext4_fsblk_t goal, | ||
447 | ext4_lblk_t *offsets, Indirect *branch) | ||
448 | { | ||
449 | int blocksize = inode->i_sb->s_blocksize; | ||
450 | int i, n = 0; | ||
451 | int err = 0; | ||
452 | struct buffer_head *bh; | ||
453 | int num; | ||
454 | ext4_fsblk_t new_blocks[4]; | ||
455 | ext4_fsblk_t current_block; | ||
456 | |||
457 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
458 | *blks, new_blocks, &err); | ||
459 | if (err) | ||
460 | return err; | ||
461 | |||
462 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
463 | /* | ||
464 | * metadata blocks and data blocks are allocated. | ||
465 | */ | ||
466 | for (n = 1; n <= indirect_blks; n++) { | ||
467 | /* | ||
468 | * Get buffer_head for parent block, zero it out | ||
469 | * and set the pointer to new one, then send | ||
470 | * parent to disk. | ||
471 | */ | ||
472 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | ||
473 | if (unlikely(!bh)) { | ||
474 | err = -EIO; | ||
475 | goto failed; | ||
476 | } | ||
477 | |||
478 | branch[n].bh = bh; | ||
479 | lock_buffer(bh); | ||
480 | BUFFER_TRACE(bh, "call get_create_access"); | ||
481 | err = ext4_journal_get_create_access(handle, bh); | ||
482 | if (err) { | ||
483 | /* Don't brelse(bh) here; it's done in | ||
484 | * ext4_journal_forget() below */ | ||
485 | unlock_buffer(bh); | ||
486 | goto failed; | ||
487 | } | ||
488 | |||
489 | memset(bh->b_data, 0, blocksize); | ||
490 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | ||
491 | branch[n].key = cpu_to_le32(new_blocks[n]); | ||
492 | *branch[n].p = branch[n].key; | ||
493 | if (n == indirect_blks) { | ||
494 | current_block = new_blocks[n]; | ||
495 | /* | ||
496 | * End of chain, update the last new metablock of | ||
497 | * the chain to point to the new allocated | ||
498 | * data blocks numbers | ||
499 | */ | ||
500 | for (i = 1; i < num; i++) | ||
501 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
502 | } | ||
503 | BUFFER_TRACE(bh, "marking uptodate"); | ||
504 | set_buffer_uptodate(bh); | ||
505 | unlock_buffer(bh); | ||
506 | |||
507 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
508 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
509 | if (err) | ||
510 | goto failed; | ||
511 | } | ||
512 | *blks = num; | ||
513 | return err; | ||
514 | failed: | ||
515 | /* Allocation failed, free what we already allocated */ | ||
516 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | ||
517 | for (i = 1; i <= n ; i++) { | ||
518 | /* | ||
519 | * branch[i].bh is newly allocated, so there is no | ||
520 | * need to revoke the block, which is why we don't | ||
521 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
522 | */ | ||
523 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
524 | EXT4_FREE_BLOCKS_FORGET); | ||
525 | } | ||
526 | for (i = n+1; i < indirect_blks; i++) | ||
527 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
528 | |||
529 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
530 | |||
531 | return err; | ||
532 | } | ||
533 | |||
534 | /** | ||
535 | * ext4_splice_branch - splice the allocated branch onto inode. | ||
536 | * @handle: handle for this transaction | ||
537 | * @inode: owner | ||
538 | * @block: (logical) number of block we are adding | ||
539 | * @chain: chain of indirect blocks (with a missing link - see | ||
540 | * ext4_alloc_branch) | ||
541 | * @where: location of missing link | ||
542 | * @num: number of indirect blocks we are adding | ||
543 | * @blks: number of direct blocks we are adding | ||
544 | * | ||
545 | * This function fills the missing link and does all housekeeping needed in | ||
546 | * inode (->i_blocks, etc.). In case of success we end up with the full | ||
547 | * chain to new block and return 0. | ||
548 | */ | ||
549 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | ||
550 | ext4_lblk_t block, Indirect *where, int num, | ||
551 | int blks) | ||
552 | { | ||
553 | int i; | ||
554 | int err = 0; | ||
555 | ext4_fsblk_t current_block; | ||
556 | |||
557 | /* | ||
558 | * If we're splicing into a [td]indirect block (as opposed to the | ||
559 | * inode) then we need to get write access to the [td]indirect block | ||
560 | * before the splice. | ||
561 | */ | ||
562 | if (where->bh) { | ||
563 | BUFFER_TRACE(where->bh, "get_write_access"); | ||
564 | err = ext4_journal_get_write_access(handle, where->bh); | ||
565 | if (err) | ||
566 | goto err_out; | ||
567 | } | ||
568 | /* That's it */ | ||
569 | |||
570 | *where->p = where->key; | ||
571 | |||
572 | /* | ||
573 | * Update the host buffer_head or inode to point to the just-allocated | ||
574 | * direct blocks. | ||
575 | */ | ||
576 | if (num == 0 && blks > 1) { | ||
577 | current_block = le32_to_cpu(where->key) + 1; | ||
578 | for (i = 1; i < blks; i++) | ||
579 | *(where->p + i) = cpu_to_le32(current_block++); | ||
580 | } | ||
581 | |||
582 | /* We are done with atomic stuff, now do the rest of housekeeping */ | ||
583 | /* had we spliced it onto indirect block? */ | ||
584 | if (where->bh) { | ||
585 | /* | ||
586 | * If we spliced it onto an indirect block, we haven't | ||
587 | * altered the inode. Note however that if it is being spliced | ||
588 | * onto an indirect block at the very end of the file (the | ||
589 | * file is growing) then we *will* alter the inode to reflect | ||
590 | * the new i_size. But that is not done here - it is done in | ||
591 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | ||
592 | */ | ||
593 | jbd_debug(5, "splicing indirect only\n"); | ||
594 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | ||
595 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | ||
596 | if (err) | ||
597 | goto err_out; | ||
598 | } else { | ||
599 | /* | ||
600 | * OK, we spliced it into the inode itself on a direct block. | ||
601 | */ | ||
602 | ext4_mark_inode_dirty(handle, inode); | ||
603 | jbd_debug(5, "splicing direct\n"); | ||
604 | } | ||
605 | return err; | ||
606 | |||
607 | err_out: | ||
608 | for (i = 1; i <= num; i++) { | ||
609 | /* | ||
610 | * branch[i].bh is newly allocated, so there is no | ||
611 | * need to revoke the block, which is why we don't | ||
612 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
613 | */ | ||
614 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | ||
615 | EXT4_FREE_BLOCKS_FORGET); | ||
616 | } | ||
617 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | ||
618 | blks, 0); | ||
619 | |||
620 | return err; | ||
621 | } | ||
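A small sketch of the num == 0 && blks > 1 fill above: once the missing link is set, the remaining direct pointers are just consecutive physical block numbers, since they came from one allocation (9000 here is an invented starting block):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t slots[4] = {0};	/* stands in for where->p onward */
	uint32_t key = 9000;		/* first newly allocated data block */
	int blks = 4;

	slots[0] = key;			/* *where->p = where->key */
	for (int i = 1; i < blks; i++)
		slots[i] = key + i;	/* consecutive physical blocks */

	for (int i = 0; i < blks; i++)
		printf("slot[%d] = %u\n", i, slots[i]);
	return 0;
}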
622 | |||
623 | /* | ||
624 | * The ext4_ind_map_blocks() function handles non-extents inodes | ||
625 | * (i.e., using the traditional indirect/double-indirect i_blocks | ||
626 | * scheme) for ext4_map_blocks(). | ||
627 | * | ||
628 | * Allocation strategy is simple: if we have to allocate something, we will | ||
629 | * have to go the whole way to leaf. So let's do it before attaching anything | ||
630 | * to tree, set linkage between the newborn blocks, write them if sync is | ||
631 | * required, recheck the path, free and repeat if check fails, otherwise | ||
632 | * set the last missing link (that will protect us from any truncate-generated | ||
633 | * removals - all blocks on the path are immune now) and possibly force the | ||
634 | * write on the parent block. | ||
635 | * That has a nice additional property: no special recovery from the failed | ||
636 | * allocations is needed - we simply release blocks and do not touch anything | ||
637 | * reachable from inode. | ||
638 | * | ||
639 | * `handle' can be NULL if create == 0. | ||
640 | * | ||
641 | * return > 0, # of blocks mapped or allocated. | ||
642 | * return = 0, if plain lookup failed. | ||
643 | * return < 0, error case. | ||
644 | * | ||
645 | * The ext4_ind_map_blocks() function should be called with | ||
646 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | ||
647 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | ||
648 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | ||
649 | * blocks. | ||
650 | */ | ||
651 | int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | ||
652 | struct ext4_map_blocks *map, | ||
653 | int flags) | ||
654 | { | ||
655 | int err = -EIO; | ||
656 | ext4_lblk_t offsets[4]; | ||
657 | Indirect chain[4]; | ||
658 | Indirect *partial; | ||
659 | ext4_fsblk_t goal; | ||
660 | int indirect_blks; | ||
661 | int blocks_to_boundary = 0; | ||
662 | int depth; | ||
663 | int count = 0; | ||
664 | ext4_fsblk_t first_block = 0; | ||
665 | |||
666 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | ||
667 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | ||
668 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | ||
669 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | ||
670 | &blocks_to_boundary); | ||
671 | |||
672 | if (depth == 0) | ||
673 | goto out; | ||
674 | |||
675 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | ||
676 | |||
677 | /* Simplest case - block found, no allocation needed */ | ||
678 | if (!partial) { | ||
679 | first_block = le32_to_cpu(chain[depth - 1].key); | ||
680 | count++; | ||
681 | /* map more blocks */ | ||
682 | while (count < map->m_len && count <= blocks_to_boundary) { | ||
683 | ext4_fsblk_t blk; | ||
684 | |||
685 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | ||
686 | |||
687 | if (blk == first_block + count) | ||
688 | count++; | ||
689 | else | ||
690 | break; | ||
691 | } | ||
692 | goto got_it; | ||
693 | } | ||
694 | |||
695 | /* Next simple case - plain lookup or failed read of indirect block */ | ||
696 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | ||
697 | goto cleanup; | ||
698 | |||
699 | /* | ||
700 | * Okay, we need to do block allocation. | ||
701 | */ | ||
702 | goal = ext4_find_goal(inode, map->m_lblk, partial); | ||
703 | |||
704 | /* the number of blocks we need to allocate for [d,t]indirect blocks */ | ||
705 | indirect_blks = (chain + depth) - partial - 1; | ||
706 | |||
707 | /* | ||
708 | * Next look up the indirect map to count the total number of | ||
709 | * direct blocks to allocate for this branch. | ||
710 | */ | ||
711 | count = ext4_blks_to_allocate(partial, indirect_blks, | ||
712 | map->m_len, blocks_to_boundary); | ||
713 | /* | ||
714 | * Block out ext4_truncate while we alter the tree | ||
715 | */ | ||
716 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | ||
717 | &count, goal, | ||
718 | offsets + (partial - chain), partial); | ||
719 | |||
720 | /* | ||
721 | * The ext4_splice_branch call will free and forget any buffers | ||
722 | * on the new chain if there is a failure, but that risks using | ||
723 | * up transaction credits, especially for bitmaps where the | ||
724 | * credits cannot be returned. Can we handle this somehow? We | ||
725 | * may need to return -EAGAIN upwards in the worst case. --sct | ||
726 | */ | ||
727 | if (!err) | ||
728 | err = ext4_splice_branch(handle, inode, map->m_lblk, | ||
729 | partial, indirect_blks, count); | ||
730 | if (err) | ||
731 | goto cleanup; | ||
732 | |||
733 | map->m_flags |= EXT4_MAP_NEW; | ||
734 | |||
735 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
736 | got_it: | ||
737 | map->m_flags |= EXT4_MAP_MAPPED; | ||
738 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | ||
739 | map->m_len = count; | ||
740 | if (count > blocks_to_boundary) | ||
741 | map->m_flags |= EXT4_MAP_BOUNDARY; | ||
742 | err = count; | ||
743 | /* Clean up and exit */ | ||
744 | partial = chain + depth - 1; /* the whole chain */ | ||
745 | cleanup: | ||
746 | while (partial > chain) { | ||
747 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
748 | brelse(partial->bh); | ||
749 | partial--; | ||
750 | } | ||
751 | out: | ||
752 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, | ||
753 | map->m_pblk, map->m_len, err); | ||
754 | return err; | ||
755 | } | ||
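The "simplest case" path above keeps extending the mapping while the on-disk pointers stay physically consecutive. A userspace model of that loop, with invented block numbers and the little-endian conversion omitted:

#include <stdio.h>
#include <stdint.h>

/* Count how many pointers continue a contiguous run, bounded by the
 * requested length and the distance to the next indirect boundary. */
static unsigned count_contig(const uint32_t *p, unsigned max, unsigned boundary)
{
	uint32_t first = p[0];
	unsigned count = 1;

	while (count < max && count <= boundary && p[count] == first + count)
		count++;
	return count;
}

int main(void)
{
	uint32_t ptrs[] = {9000, 9001, 9002, 9005};	/* run breaks at 9005 */
	printf("%u\n", count_contig(ptrs, 4, 3));	/* prints 3 */
	return 0;
}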
756 | |||
757 | /* | ||
758 | * O_DIRECT for ext3 (or indirect map) based files | ||
759 | * | ||
760 | * If the O_DIRECT write will extend the file then add this inode to the | ||
761 | * orphan list. So recovery will truncate it back to the original size | ||
762 | * if the machine crashes during the write. | ||
763 | * | ||
764 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | ||
765 | * crashes then stale disk data _may_ be exposed inside the file. But current | ||
766 | * VFS code falls back into buffered path in that case so we are safe. | ||
767 | */ | ||
768 | ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
769 | const struct iovec *iov, loff_t offset, | ||
770 | unsigned long nr_segs) | ||
771 | { | ||
772 | struct file *file = iocb->ki_filp; | ||
773 | struct inode *inode = file->f_mapping->host; | ||
774 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
775 | handle_t *handle; | ||
776 | ssize_t ret; | ||
777 | int orphan = 0; | ||
778 | size_t count = iov_length(iov, nr_segs); | ||
779 | int retries = 0; | ||
780 | |||
781 | if (rw == WRITE) { | ||
782 | loff_t final_size = offset + count; | ||
783 | |||
784 | if (final_size > inode->i_size) { | ||
785 | /* Credits for sb + inode write */ | ||
786 | handle = ext4_journal_start(inode, 2); | ||
787 | if (IS_ERR(handle)) { | ||
788 | ret = PTR_ERR(handle); | ||
789 | goto out; | ||
790 | } | ||
791 | ret = ext4_orphan_add(handle, inode); | ||
792 | if (ret) { | ||
793 | ext4_journal_stop(handle); | ||
794 | goto out; | ||
795 | } | ||
796 | orphan = 1; | ||
797 | ei->i_disksize = inode->i_size; | ||
798 | ext4_journal_stop(handle); | ||
799 | } | ||
800 | } | ||
801 | |||
802 | retry: | ||
803 | if (rw == READ && ext4_should_dioread_nolock(inode)) | ||
804 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
805 | inode->i_sb->s_bdev, iov, | ||
806 | offset, nr_segs, | ||
807 | ext4_get_block, NULL, NULL, 0); | ||
808 | else { | ||
809 | ret = blockdev_direct_IO(rw, iocb, inode, iov, | ||
810 | offset, nr_segs, ext4_get_block); | ||
811 | |||
812 | if (unlikely((rw & WRITE) && ret < 0)) { | ||
813 | loff_t isize = i_size_read(inode); | ||
814 | loff_t end = offset + iov_length(iov, nr_segs); | ||
815 | |||
816 | if (end > isize) | ||
817 | ext4_truncate_failed_write(inode); | ||
818 | } | ||
819 | } | ||
820 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
821 | goto retry; | ||
822 | |||
823 | if (orphan) { | ||
824 | int err; | ||
825 | |||
826 | /* Credits for sb + inode write */ | ||
827 | handle = ext4_journal_start(inode, 2); | ||
828 | if (IS_ERR(handle)) { | ||
829 | /* This is really bad luck. We've written the data | ||
830 | * but cannot extend i_size. Bail out and pretend | ||
831 | * the write failed... */ | ||
832 | ret = PTR_ERR(handle); | ||
833 | if (inode->i_nlink) | ||
834 | ext4_orphan_del(NULL, inode); | ||
835 | |||
836 | goto out; | ||
837 | } | ||
838 | if (inode->i_nlink) | ||
839 | ext4_orphan_del(handle, inode); | ||
840 | if (ret > 0) { | ||
841 | loff_t end = offset + ret; | ||
842 | if (end > inode->i_size) { | ||
843 | ei->i_disksize = end; | ||
844 | i_size_write(inode, end); | ||
845 | /* | ||
846 | * We're going to return a positive `ret' | ||
847 | * here due to non-zero-length I/O, so there's | ||
848 | * no way of reporting error returns from | ||
849 | * ext4_mark_inode_dirty() to userspace. So | ||
850 | * ignore it. | ||
851 | */ | ||
852 | ext4_mark_inode_dirty(handle, inode); | ||
853 | } | ||
854 | } | ||
855 | err = ext4_journal_stop(handle); | ||
856 | if (ret == 0) | ||
857 | ret = err; | ||
858 | } | ||
859 | out: | ||
860 | return ret; | ||
861 | } | ||
862 | |||
863 | /* | ||
864 | * Calculate the number of metadata blocks we need to reserve | ||
865 | * to allocate a new block at @lblock for a non-extent-based file | ||
866 | */ | ||
867 | int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) | ||
868 | { | ||
869 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
870 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); | ||
871 | int blk_bits; | ||
872 | |||
873 | if (lblock < EXT4_NDIR_BLOCKS) | ||
874 | return 0; | ||
875 | |||
876 | lblock -= EXT4_NDIR_BLOCKS; | ||
877 | |||
878 | if (ei->i_da_metadata_calc_len && | ||
879 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | ||
880 | ei->i_da_metadata_calc_len++; | ||
881 | return 0; | ||
882 | } | ||
883 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | ||
884 | ei->i_da_metadata_calc_len = 1; | ||
885 | blk_bits = order_base_2(lblock); | ||
886 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | ||
887 | } | ||
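A hedged worked example of the reservation math above, assuming 4KB blocks (1024 = 2^10 pointers per block, so EXT4_ADDR_PER_BLOCK_BITS is 10) and modeling order_base_2() as a round-up log2:

#include <stdio.h>

/* Round-up log2, as a stand-in for the kernel's order_base_2(). */
static int order_base_2(unsigned long n)
{
	int bits = 0;

	while ((1UL << bits) < n)
		bits++;
	return bits;
}

int main(void)
{
	const int NDIR = 12, ptr_bits = 10;	/* assumed 4KB-block geometry */
	unsigned long lblock = 5000;		/* invented logical block */

	if (lblock >= NDIR) {
		lblock -= NDIR;				/* 4988 */
		int blk_bits = order_base_2(lblock);	/* 13: 2^12 < 4988 <= 2^13 */
		/* one indirect + one dindirect block to reserve */
		printf("%d metadata blocks\n", blk_bits / ptr_bits + 1);
	}
	return 0;
}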
888 | |||
889 | int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) | ||
890 | { | ||
891 | int indirects; | ||
892 | |||
893 | /* if nrblocks are contiguous */ | ||
894 | if (chunk) { | ||
895 | /* | ||
896 | * With N contiguous data blocks, we need at most | ||
897 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
898 | * 2 dindirect blocks, and 1 tindirect block | ||
899 | */ | ||
900 | return DIV_ROUND_UP(nrblocks, | ||
901 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
902 | } | ||
903 | /* | ||
904 | * if nrblocks are not contiguous, worst case: each block touches | ||
905 | * an indirect block, and each indirect block touches a double indirect | ||
906 | * block, plus a triple indirect block | ||
907 | */ | ||
908 | indirects = nrblocks * 2 + 1; | ||
909 | return indirects; | ||
910 | } | ||
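For instance, in the contiguous case with 4KB blocks (1024 pointers per indirect block), a 2048-block chunk costs DIV_ROUND_UP(2048, 1024) + 4 = 6 credits. The same arithmetic in plain C:

#include <stdio.h>

int main(void)
{
	int addr_per_block = 1024;	/* assumed 4KB block size */
	int nrblocks = 2048;		/* invented contiguous chunk */
	/* 2 indirect + (2 dindirect + 1 tindirect + 1 extra indirect) = 6 */
	int credits = (nrblocks + addr_per_block - 1) / addr_per_block + 4;

	printf("%d credits\n", credits);
	return 0;
}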
911 | |||
912 | /* | ||
913 | * Truncate transactions can be complex and absolutely huge. So we need to | ||
914 | * be able to restart the transaction at a convenient checkpoint to make | ||
915 | * sure we don't overflow the journal. | ||
916 | * | ||
917 | * start_transaction gets us a new handle for a truncate transaction, | ||
918 | * and extend_transaction tries to extend the existing one a bit. If | ||
919 | * extend fails, we need to propagate the failure up and restart the | ||
920 | * transaction in the top-level truncate loop. --sct | ||
921 | */ | ||
922 | static handle_t *start_transaction(struct inode *inode) | ||
923 | { | ||
924 | handle_t *result; | ||
925 | |||
926 | result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); | ||
927 | if (!IS_ERR(result)) | ||
928 | return result; | ||
929 | |||
930 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
931 | return result; | ||
932 | } | ||
933 | |||
934 | /* | ||
935 | * Try to extend this transaction for the purposes of truncation. | ||
936 | * | ||
937 | * Returns 0 if we managed to create more room. If we can't create more | ||
938 | * room and the transaction must be restarted, we return 1. | ||
939 | */ | ||
940 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | ||
941 | { | ||
942 | if (!ext4_handle_valid(handle)) | ||
943 | return 0; | ||
944 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | ||
945 | return 0; | ||
946 | if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) | ||
947 | return 0; | ||
948 | return 1; | ||
949 | } | ||
950 | |||
951 | /* | ||
952 | * Probably it should be a library function... search for first non-zero word | ||
953 | * or memcmp with zero_page, whatever is better for particular architecture. | ||
954 | * Linus? | ||
955 | */ | ||
956 | static inline int all_zeroes(__le32 *p, __le32 *q) | ||
957 | { | ||
958 | while (p < q) | ||
959 | if (*p++) | ||
960 | return 0; | ||
961 | return 1; | ||
962 | } | ||
963 | |||
964 | /** | ||
965 | * ext4_find_shared - find the indirect blocks for partial truncation. | ||
966 | * @inode: inode in question | ||
967 | * @depth: depth of the affected branch | ||
968 | * @offsets: offsets of pointers in that branch (see ext4_block_to_path) | ||
969 | * @chain: place to store the pointers to partial indirect blocks | ||
970 | * @top: place to store the (detached) top of branch | ||
971 | * | ||
972 | * This is a helper function used by ext4_truncate(). | ||
973 | * | ||
974 | * When we do truncate() we may have to clean the ends of several | ||
975 | * indirect blocks but leave the blocks themselves alive. Block is | ||
976 | * partially truncated if some data below the new i_size is referred | ||
977 | * from it (and it is on the path to the first completely truncated | ||
978 | * data block, indeed). We have to free the top of that path along | ||
979 | * with everything to the right of the path. Since no allocation | ||
980 | * past the truncation point is possible until ext4_truncate() | ||
981 | * finishes, we may safely do the latter, but top of branch may | ||
982 | * require special attention - pageout below the truncation point | ||
983 | * might try to populate it. | ||
984 | * | ||
985 | * We atomically detach the top of branch from the tree, store the | ||
986 | * block number of its root in *@top, pointers to buffer_heads of | ||
987 | * partially truncated blocks - in @chain[].bh and pointers to | ||
988 | * their last elements that should not be removed - in | ||
989 | * @chain[].p. Return value is the pointer to last filled element | ||
990 | * of @chain. | ||
991 | * | ||
992 | * The work left to the caller is the actual freeing of subtrees: | ||
993 | * a) free the subtree starting from *@top | ||
994 | * b) free the subtrees whose roots are stored in | ||
995 | * (@chain[i].p+1 .. end of @chain[i].bh->b_data) | ||
996 | * c) free the subtrees growing from the inode past the @chain[0]. | ||
997 | * (no partially truncated stuff there). */ | ||
998 | |||
999 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | ||
1000 | ext4_lblk_t offsets[4], Indirect chain[4], | ||
1001 | __le32 *top) | ||
1002 | { | ||
1003 | Indirect *partial, *p; | ||
1004 | int k, err; | ||
1005 | |||
1006 | *top = 0; | ||
1007 | /* Make k index the deepest non-null offset + 1 */ | ||
1008 | for (k = depth; k > 1 && !offsets[k-1]; k--) | ||
1009 | ; | ||
1010 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | ||
1011 | /* Writer: pointers */ | ||
1012 | if (!partial) | ||
1013 | partial = chain + k-1; | ||
1014 | /* | ||
1015 | * If the branch acquired continuation since we've looked at it - | ||
1016 | * fine, it should all survive and (new) top doesn't belong to us. | ||
1017 | */ | ||
1018 | if (!partial->key && *partial->p) | ||
1019 | /* Writer: end */ | ||
1020 | goto no_top; | ||
1021 | for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) | ||
1022 | ; | ||
1023 | /* | ||
1024 | * OK, we've found the last block that must survive. The rest of our | ||
1025 | * branch should be detached before unlocking. However, if that rest | ||
1026 | * of branch is all ours and does not grow immediately from the inode | ||
1027 | * it's easier to cheat and just decrement partial->p. | ||
1028 | */ | ||
1029 | if (p == chain + k - 1 && p > chain) { | ||
1030 | p->p--; | ||
1031 | } else { | ||
1032 | *top = *p->p; | ||
1033 | /* Nope, don't do this in ext4. Must leave the tree intact */ | ||
1034 | #if 0 | ||
1035 | *p->p = 0; | ||
1036 | #endif | ||
1037 | } | ||
1038 | /* Writer: end */ | ||
1039 | |||
1040 | while (partial > p) { | ||
1041 | brelse(partial->bh); | ||
1042 | partial--; | ||
1043 | } | ||
1044 | no_top: | ||
1045 | return partial; | ||
1046 | } | ||
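The scan at the top of ext4_find_shared() trims trailing zero offsets so the partial walk stops at the deepest level that still constrains the truncate. A userspace model with an invented path into the double-indirect tree:

#include <stdio.h>

int main(void)
{
	/* path for a truncate point at the start of a dindirect subtree:
	 * slot 13 (EXT4_DIND_BLOCK), index 3, index 0 */
	long offsets[4] = {13, 3, 0, 0};
	int depth = 3, k;

	for (k = depth; k > 1 && !offsets[k - 1]; k--)
		;
	printf("k = %d\n", k);	/* 2: the zero tail above is ignored */
	return 0;
}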
1047 | |||
1048 | /* | ||
1049 | * Zero a number of block pointers in either an inode or an indirect block. | ||
1050 | * If we restart the transaction we must again get write access to the | ||
1051 | * indirect block for further modification. | ||
1052 | * | ||
1053 | * We release `count' blocks on disk, but (last - first) may be greater | ||
1054 | * than `count' because there can be holes in there. | ||
1055 | * | ||
1056 | * Return 0 on success, 1 on invalid block range | ||
1057 | * and < 0 on fatal error. | ||
1058 | */ | ||
1059 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
1060 | struct buffer_head *bh, | ||
1061 | ext4_fsblk_t block_to_free, | ||
1062 | unsigned long count, __le32 *first, | ||
1063 | __le32 *last) | ||
1064 | { | ||
1065 | __le32 *p; | ||
1066 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
1067 | int err; | ||
1068 | |||
1069 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
1070 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
1071 | |||
1072 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
1073 | count)) { | ||
1074 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
1075 | "blocks %llu len %lu", | ||
1076 | (unsigned long long) block_to_free, count); | ||
1077 | return 1; | ||
1078 | } | ||
1079 | |||
1080 | if (try_to_extend_transaction(handle, inode)) { | ||
1081 | if (bh) { | ||
1082 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
1083 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
1084 | if (unlikely(err)) | ||
1085 | goto out_err; | ||
1086 | } | ||
1087 | err = ext4_mark_inode_dirty(handle, inode); | ||
1088 | if (unlikely(err)) | ||
1089 | goto out_err; | ||
1090 | err = ext4_truncate_restart_trans(handle, inode, | ||
1091 | ext4_blocks_for_truncate(inode)); | ||
1092 | if (unlikely(err)) | ||
1093 | goto out_err; | ||
1094 | if (bh) { | ||
1095 | BUFFER_TRACE(bh, "retaking write access"); | ||
1096 | err = ext4_journal_get_write_access(handle, bh); | ||
1097 | if (unlikely(err)) | ||
1098 | goto out_err; | ||
1099 | } | ||
1100 | } | ||
1101 | |||
1102 | for (p = first; p < last; p++) | ||
1103 | *p = 0; | ||
1104 | |||
1105 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); | ||
1106 | return 0; | ||
1107 | out_err: | ||
1108 | ext4_std_error(inode->i_sb, err); | ||
1109 | return err; | ||
1110 | } | ||
1111 | |||
1112 | /** | ||
1113 | * ext4_free_data - free a list of data blocks | ||
1114 | * @handle: handle for this transaction | ||
1115 | * @inode: inode we are dealing with | ||
1116 | * @this_bh: indirect buffer_head which contains *@first and *@last | ||
1117 | * @first: array of block numbers | ||
1118 | * @last: points immediately past the end of array | ||
1119 | * | ||
1120 | * We are freeing all blocks referred from that array (numbers are stored as | ||
1121 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | ||
1122 | * | ||
1123 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | ||
1124 | * blocks are contiguous then releasing them at one time will only affect one | ||
1125 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | ||
1126 | * actually use a lot of journal space. | ||
1127 | * | ||
1128 | * @this_bh will be %NULL if @first and @last point into the inode's direct | ||
1129 | * block pointers. | ||
1130 | */ | ||
1131 | static void ext4_free_data(handle_t *handle, struct inode *inode, | ||
1132 | struct buffer_head *this_bh, | ||
1133 | __le32 *first, __le32 *last) | ||
1134 | { | ||
1135 | ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ | ||
1136 | unsigned long count = 0; /* Number of blocks in the run */ | ||
1137 | __le32 *block_to_free_p = NULL; /* Pointer into inode/ind | ||
1138 | corresponding to | ||
1139 | block_to_free */ | ||
1140 | ext4_fsblk_t nr; /* Current block # */ | ||
1141 | __le32 *p; /* Pointer into inode/ind | ||
1142 | for current block */ | ||
1143 | int err = 0; | ||
1144 | |||
1145 | if (this_bh) { /* For indirect block */ | ||
1146 | BUFFER_TRACE(this_bh, "get_write_access"); | ||
1147 | err = ext4_journal_get_write_access(handle, this_bh); | ||
1148 | /* Important: if we can't update the indirect pointers | ||
1149 | * to the blocks, we can't free them. */ | ||
1150 | if (err) | ||
1151 | return; | ||
1152 | } | ||
1153 | |||
1154 | for (p = first; p < last; p++) { | ||
1155 | nr = le32_to_cpu(*p); | ||
1156 | if (nr) { | ||
1157 | /* accumulate blocks to free if they're contiguous */ | ||
1158 | if (count == 0) { | ||
1159 | block_to_free = nr; | ||
1160 | block_to_free_p = p; | ||
1161 | count = 1; | ||
1162 | } else if (nr == block_to_free + count) { | ||
1163 | count++; | ||
1164 | } else { | ||
1165 | err = ext4_clear_blocks(handle, inode, this_bh, | ||
1166 | block_to_free, count, | ||
1167 | block_to_free_p, p); | ||
1168 | if (err) | ||
1169 | break; | ||
1170 | block_to_free = nr; | ||
1171 | block_to_free_p = p; | ||
1172 | count = 1; | ||
1173 | } | ||
1174 | } | ||
1175 | } | ||
1176 | |||
1177 | if (!err && count > 0) | ||
1178 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, | ||
1179 | count, block_to_free_p, p); | ||
1180 | if (err < 0) | ||
1181 | /* fatal error */ | ||
1182 | return; | ||
1183 | |||
1184 | if (this_bh) { | ||
1185 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | ||
1186 | |||
1187 | /* | ||
1188 | * The buffer head should have an attached journal head at this | ||
1189 | * point. However, if the data is corrupted and an indirect | ||
1190 | * block pointed to itself, it would have been detached when | ||
1191 | * the block was cleared. Check for this instead of OOPSing. | ||
1192 | */ | ||
1193 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | ||
1194 | ext4_handle_dirty_metadata(handle, inode, this_bh); | ||
1195 | else | ||
1196 | EXT4_ERROR_INODE(inode, | ||
1197 | "circular indirect block detected at " | ||
1198 | "block %llu", | ||
1199 | (unsigned long long) this_bh->b_blocknr); | ||
1200 | } | ||
1201 | } | ||
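The run accumulation above is self-contained enough to model in userspace. This sketch (invented block numbers; a zero entry stands for a hole) batches contiguous runs exactly the way ext4_free_data() does before each ext4_clear_blocks() call:

#include <stdio.h>
#include <stdint.h>

static void free_run(uint32_t start, unsigned long count)
{
	printf("free %lu block(s) starting at %u\n", count, start);
}

int main(void)
{
	uint32_t blocks[] = {100, 101, 102, 0, 200, 201};
	uint32_t start = 0;
	unsigned long count = 0;

	for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		uint32_t nr = blocks[i];

		if (!nr)
			continue;	/* a hole */
		if (count == 0) {
			start = nr;	/* begin a new run */
			count = 1;
		} else if (nr == start + count) {
			count++;	/* extend the current run */
		} else {
			free_run(start, count);	/* flush, then restart */
			start = nr;
			count = 1;
		}
	}
	if (count)
		free_run(start, count);	/* final partial run */
	return 0;
}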
1202 | |||
1203 | /** | ||
1204 | * ext4_free_branches - free an array of branches | ||
1205 | * @handle: JBD handle for this transaction | ||
1206 | * @inode: inode we are dealing with | ||
1207 | * @parent_bh: the buffer_head which contains *@first and *@last | ||
1208 | * @first: array of block numbers | ||
1209 | * @last: pointer immediately past the end of array | ||
1210 | * @depth: depth of the branches to free | ||
1211 | * | ||
1212 | * We are freeing all blocks referred from these branches (numbers are | ||
1213 | * stored as little-endian 32-bit) and updating @inode->i_blocks | ||
1214 | * appropriately. | ||
1215 | */ | ||
1216 | static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
1217 | struct buffer_head *parent_bh, | ||
1218 | __le32 *first, __le32 *last, int depth) | ||
1219 | { | ||
1220 | ext4_fsblk_t nr; | ||
1221 | __le32 *p; | ||
1222 | |||
1223 | if (ext4_handle_is_aborted(handle)) | ||
1224 | return; | ||
1225 | |||
1226 | if (depth--) { | ||
1227 | struct buffer_head *bh; | ||
1228 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
1229 | p = last; | ||
1230 | while (--p >= first) { | ||
1231 | nr = le32_to_cpu(*p); | ||
1232 | if (!nr) | ||
1233 | continue; /* A hole */ | ||
1234 | |||
1235 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
1236 | nr, 1)) { | ||
1237 | EXT4_ERROR_INODE(inode, | ||
1238 | "invalid indirect mapped " | ||
1239 | "block %lu (level %d)", | ||
1240 | (unsigned long) nr, depth); | ||
1241 | break; | ||
1242 | } | ||
1243 | |||
1244 | /* Go read the buffer for the next level down */ | ||
1245 | bh = sb_bread(inode->i_sb, nr); | ||
1246 | |||
1247 | /* | ||
1248 | * A read failure? Report error and clear slot | ||
1249 | * (should be rare). | ||
1250 | */ | ||
1251 | if (!bh) { | ||
1252 | EXT4_ERROR_INODE_BLOCK(inode, nr, | ||
1253 | "Read failure"); | ||
1254 | continue; | ||
1255 | } | ||
1256 | |||
1257 | /* This zaps the entire block. Bottom up. */ | ||
1258 | BUFFER_TRACE(bh, "free child branches"); | ||
1259 | ext4_free_branches(handle, inode, bh, | ||
1260 | (__le32 *) bh->b_data, | ||
1261 | (__le32 *) bh->b_data + addr_per_block, | ||
1262 | depth); | ||
1263 | brelse(bh); | ||
1264 | |||
1265 | /* | ||
1266 | * Everything below this pointer has been | ||
1267 | * released. Now let this top-of-subtree go. | ||
1268 | * | ||
1269 | * We want the freeing of this indirect block to be | ||
1270 | * atomic in the journal with the updating of the | ||
1271 | * bitmap block which owns it. So make some room in | ||
1272 | * the journal. | ||
1273 | * | ||
1274 | * We zero the parent pointer *after* freeing its | ||
1275 | * pointee in the bitmaps, so if extend_transaction() | ||
1276 | * for some reason fails to put the bitmap changes and | ||
1277 | * the release into the same transaction, recovery | ||
1278 | * will merely complain about releasing a free block, | ||
1279 | * rather than leaking blocks. | ||
1280 | */ | ||
1281 | if (ext4_handle_is_aborted(handle)) | ||
1282 | return; | ||
1283 | if (try_to_extend_transaction(handle, inode)) { | ||
1284 | ext4_mark_inode_dirty(handle, inode); | ||
1285 | ext4_truncate_restart_trans(handle, inode, | ||
1286 | ext4_blocks_for_truncate(inode)); | ||
1287 | } | ||
1288 | |||
1289 | /* | ||
1290 | * The forget flag here is critical because if | ||
1291 | * we are journaling (and not doing data | ||
1292 | * journaling), we have to make sure a revoke | ||
1293 | * record is written to prevent the journal | ||
1294 | * replay from overwriting the (former) | ||
1295 | * indirect block if it gets reallocated as a | ||
1296 | * data block. This must happen in the same | ||
1297 | * transaction where the data blocks are | ||
1298 | * actually freed. | ||
1299 | */ | ||
1300 | ext4_free_blocks(handle, inode, NULL, nr, 1, | ||
1301 | EXT4_FREE_BLOCKS_METADATA| | ||
1302 | EXT4_FREE_BLOCKS_FORGET); | ||
1303 | |||
1304 | if (parent_bh) { | ||
1305 | /* | ||
1306 | * The block which we have just freed is | ||
1307 | * pointed to by an indirect block: journal it | ||
1308 | */ | ||
1309 | BUFFER_TRACE(parent_bh, "get_write_access"); | ||
1310 | if (!ext4_journal_get_write_access(handle, | ||
1311 | parent_bh)){ | ||
1312 | *p = 0; | ||
1313 | BUFFER_TRACE(parent_bh, | ||
1314 | "call ext4_handle_dirty_metadata"); | ||
1315 | ext4_handle_dirty_metadata(handle, | ||
1316 | inode, | ||
1317 | parent_bh); | ||
1318 | } | ||
1319 | } | ||
1320 | } | ||
1321 | } else { | ||
1322 | /* We have reached the bottom of the tree. */ | ||
1323 | BUFFER_TRACE(parent_bh, "free data blocks"); | ||
1324 | ext4_free_data(handle, inode, parent_bh, first, last); | ||
1325 | } | ||
1326 | } | ||
1327 | |||
1328 | void ext4_ind_truncate(struct inode *inode) | ||
1329 | { | ||
1330 | handle_t *handle; | ||
1331 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1332 | __le32 *i_data = ei->i_data; | ||
1333 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
1334 | struct address_space *mapping = inode->i_mapping; | ||
1335 | ext4_lblk_t offsets[4]; | ||
1336 | Indirect chain[4]; | ||
1337 | Indirect *partial; | ||
1338 | __le32 nr = 0; | ||
1339 | int n = 0; | ||
1340 | ext4_lblk_t last_block, max_block; | ||
1341 | unsigned blocksize = inode->i_sb->s_blocksize; | ||
1342 | |||
1343 | handle = start_transaction(inode); | ||
1344 | if (IS_ERR(handle)) | ||
1345 | return; /* AKPM: return what? */ | ||
1346 | |||
1347 | last_block = (inode->i_size + blocksize-1) | ||
1348 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
1349 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
1350 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
1351 | |||
1352 | if (inode->i_size & (blocksize - 1)) | ||
1353 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | ||
1354 | goto out_stop; | ||
1355 | |||
1356 | if (last_block != max_block) { | ||
1357 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | ||
1358 | if (n == 0) | ||
1359 | goto out_stop; /* error */ | ||
1360 | } | ||
1361 | |||
1362 | /* | ||
1363 | * OK. This truncate is going to happen. We add the inode to the | ||
1364 | * orphan list, so that if this truncate spans multiple transactions, | ||
1365 | * and we crash, we will resume the truncate when the filesystem | ||
1366 | * recovers. It also marks the inode dirty, to catch the new size. | ||
1367 | * | ||
1368 | * Implication: the file must always be in a sane, consistent | ||
1369 | * truncatable state while each transaction commits. | ||
1370 | */ | ||
1371 | if (ext4_orphan_add(handle, inode)) | ||
1372 | goto out_stop; | ||
1373 | |||
1374 | /* | ||
1375 | * From here we block out all ext4_get_block() callers who want to | ||
1376 | * modify the block allocation tree. | ||
1377 | */ | ||
1378 | down_write(&ei->i_data_sem); | ||
1379 | |||
1380 | ext4_discard_preallocations(inode); | ||
1381 | |||
1382 | /* | ||
1383 | * The orphan list entry will now protect us from any crash which | ||
1384 | * occurs before the truncate completes, so it is now safe to propagate | ||
1385 | * the new, shorter inode size (held for now in i_size) into the | ||
1386 | * on-disk inode. We do this via i_disksize, which is the value which | ||
1387 | * ext4 *really* writes onto the disk inode. | ||
1388 | */ | ||
1389 | ei->i_disksize = inode->i_size; | ||
1390 | |||
1391 | if (last_block == max_block) { | ||
1392 | /* | ||
1393 | * It is unnecessary to free any data blocks if last_block is | ||
1394 | * equal to the indirect block limit. | ||
1395 | */ | ||
1396 | goto out_unlock; | ||
1397 | } else if (n == 1) { /* direct blocks */ | ||
1398 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | ||
1399 | i_data + EXT4_NDIR_BLOCKS); | ||
1400 | goto do_indirects; | ||
1401 | } | ||
1402 | |||
1403 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | ||
1404 | /* Kill the top of shared branch (not detached) */ | ||
1405 | if (nr) { | ||
1406 | if (partial == chain) { | ||
1407 | /* Shared branch grows from the inode */ | ||
1408 | ext4_free_branches(handle, inode, NULL, | ||
1409 | &nr, &nr+1, (chain+n-1) - partial); | ||
1410 | *partial->p = 0; | ||
1411 | /* | ||
1412 | * We mark the inode dirty prior to restart, | ||
1413 | * and prior to stop. No need for it here. | ||
1414 | */ | ||
1415 | } else { | ||
1416 | /* Shared branch grows from an indirect block */ | ||
1417 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
1418 | ext4_free_branches(handle, inode, partial->bh, | ||
1419 | partial->p, | ||
1420 | partial->p+1, (chain+n-1) - partial); | ||
1421 | } | ||
1422 | } | ||
1423 | /* Clear the ends of indirect blocks on the shared branch */ | ||
1424 | while (partial > chain) { | ||
1425 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | ||
1426 | (__le32*)partial->bh->b_data+addr_per_block, | ||
1427 | (chain+n-1) - partial); | ||
1428 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1429 | brelse(partial->bh); | ||
1430 | partial--; | ||
1431 | } | ||
1432 | do_indirects: | ||
1433 | /* Kill the remaining (whole) subtrees */ | ||
1434 | switch (offsets[0]) { | ||
1435 | default: | ||
1436 | nr = i_data[EXT4_IND_BLOCK]; | ||
1437 | if (nr) { | ||
1438 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | ||
1439 | i_data[EXT4_IND_BLOCK] = 0; | ||
1440 | } | ||
1441 | case EXT4_IND_BLOCK: | ||
1442 | nr = i_data[EXT4_DIND_BLOCK]; | ||
1443 | if (nr) { | ||
1444 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | ||
1445 | i_data[EXT4_DIND_BLOCK] = 0; | ||
1446 | } | ||
1447 | case EXT4_DIND_BLOCK: | ||
1448 | nr = i_data[EXT4_TIND_BLOCK]; | ||
1449 | if (nr) { | ||
1450 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | ||
1451 | i_data[EXT4_TIND_BLOCK] = 0; | ||
1452 | } | ||
1453 | case EXT4_TIND_BLOCK: | ||
1454 | ; | ||
1455 | } | ||
1456 | |||
1457 | out_unlock: | ||
1458 | up_write(&ei->i_data_sem); | ||
1459 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
1460 | ext4_mark_inode_dirty(handle, inode); | ||
1461 | |||
1462 | /* | ||
1463 | * In a multi-transaction truncate, we only make the final transaction | ||
1464 | * synchronous | ||
1465 | */ | ||
1466 | if (IS_SYNC(inode)) | ||
1467 | ext4_handle_sync(handle); | ||
1468 | out_stop: | ||
1469 | /* | ||
1470 | * If this was a simple ftruncate(), and the file will remain alive | ||
1471 | * then we need to clear up the orphan record which we created above. | ||
1472 | * However, if this was a real unlink then we were called by | ||
1473 | * ext4_delete_inode(), and we allow that function to clean up the | ||
1474 | * orphan info for us. | ||
1475 | */ | ||
1476 | if (inode->i_nlink) | ||
1477 | ext4_orphan_del(handle, inode); | ||
1478 | |||
1479 | ext4_journal_stop(handle); | ||
1480 | trace_ext4_truncate_exit(inode); | ||
1481 | } | ||
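As a quick check of the rounding at the top of ext4_ind_truncate(): with 4KB blocks and i_size = 10000, last_block comes out to 3, i.e. blocks 0..2 still hold data and freeing starts at block 3. A short demonstration of the arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned blocksize = 4096, bits = 12;	/* assumed 4KB blocks */
	unsigned long long i_size = 10000;	/* invented file size */
	unsigned long last_block = (i_size + blocksize - 1) >> bits;

	printf("last_block = %lu\n", last_block);	/* 3 */
	return 0;
}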
1482 | |||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e5191f9f398..d47264cafee0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -12,10 +12,6 @@ | |||
12 | * | 12 | * |
13 | * Copyright (C) 1991, 1992 Linus Torvalds | 13 | * Copyright (C) 1991, 1992 Linus Torvalds |
14 | * | 14 | * |
15 | * Goal-directed block allocation by Stephen Tweedie | ||
16 | * (sct@redhat.com), 1993, 1998 | ||
17 | * Big-endian to little-endian byte-swapping/bitmaps by | ||
18 | * David S. Miller (davem@caip.rutgers.edu), 1995 | ||
19 | * 64-bit file support on 64-bit platforms by Jakub Jelinek | 15 | * 64-bit file support on 64-bit platforms by Jakub Jelinek |
20 | * (jj@sunsite.ms.mff.cuni.cz) | 16 | * (jj@sunsite.ms.mff.cuni.cz) |
21 | * | 17 | * |
@@ -47,6 +43,7 @@ | |||
47 | #include "xattr.h" | 43 | #include "xattr.h" |
48 | #include "acl.h" | 44 | #include "acl.h" |
49 | #include "ext4_extents.h" | 45 | #include "ext4_extents.h" |
46 | #include "truncate.h" | ||
50 | 47 | ||
51 | #include <trace/events/ext4.h> | 48 | #include <trace/events/ext4.h> |
52 | 49 | ||
@@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) | |||
89 | } | 86 | } |
90 | 87 | ||
91 | /* | 88 | /* |
92 | * Work out how many blocks we need to proceed with the next chunk of a | ||
93 | * truncate transaction. | ||
94 | */ | ||
95 | static unsigned long blocks_for_truncate(struct inode *inode) | ||
96 | { | ||
97 | ext4_lblk_t needed; | ||
98 | |||
99 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | ||
100 | |||
101 | /* Give ourselves just enough room to cope with inodes in which | ||
102 | * i_blocks is corrupt: we've seen disk corruptions in the past | ||
103 | * which resulted in random data in an inode which looked enough | ||
104 | * like a regular file for ext4 to try to delete it. Things | ||
105 | * will go a bit crazy if that happens, but at least we should | ||
106 | * try not to panic the whole kernel. */ | ||
107 | if (needed < 2) | ||
108 | needed = 2; | ||
109 | |||
110 | /* But we need to bound the transaction so we don't overflow the | ||
111 | * journal. */ | ||
112 | if (needed > EXT4_MAX_TRANS_DATA) | ||
113 | needed = EXT4_MAX_TRANS_DATA; | ||
114 | |||
115 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; | ||
116 | } | ||
117 | |||
118 | /* | ||
119 | * Truncate transactions can be complex and absolutely huge. So we need to | ||
120 | * be able to restart the transaction at a convenient checkpoint to make | ||
121 | * sure we don't overflow the journal. | ||
122 | * | ||
123 | * start_transaction gets us a new handle for a truncate transaction, | ||
124 | * and extend_transaction tries to extend the existing one a bit. If | ||
125 | * extend fails, we need to propagate the failure up and restart the | ||
126 | * transaction in the top-level truncate loop. --sct | ||
127 | */ | ||
128 | static handle_t *start_transaction(struct inode *inode) | ||
129 | { | ||
130 | handle_t *result; | ||
131 | |||
132 | result = ext4_journal_start(inode, blocks_for_truncate(inode)); | ||
133 | if (!IS_ERR(result)) | ||
134 | return result; | ||
135 | |||
136 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
137 | return result; | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Try to extend this transaction for the purposes of truncation. | ||
142 | * | ||
143 | * Returns 0 if we managed to create more room. If we can't create more | ||
144 | * room and the transaction must be restarted, we return 1. | ||
145 | */ | ||
146 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | ||
147 | { | ||
148 | if (!ext4_handle_valid(handle)) | ||
149 | return 0; | ||
150 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | ||
151 | return 0; | ||
152 | if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) | ||
153 | return 0; | ||
154 | return 1; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Restart the transaction associated with *handle. This does a commit, | 89 | * Restart the transaction associated with *handle. This does a commit, |
159 | * so before we call here everything must be consistently dirtied against | 90 | * so before we call here everything must be consistently dirtied against |
160 | * this transaction. | 91 | * this transaction. |
@@ -190,6 +121,33 @@ void ext4_evict_inode(struct inode *inode) | |||
190 | 121 | ||
191 | trace_ext4_evict_inode(inode); | 122 | trace_ext4_evict_inode(inode); |
192 | if (inode->i_nlink) { | 123 | if (inode->i_nlink) { |
124 | /* | ||
125 | * When journalling data dirty buffers are tracked only in the | ||
126 | * journal. So although mm thinks everything is clean and | ||
127 | * ready for reaping the inode might still have some pages to | ||
128 | * write in the running transaction or waiting to be | ||
129 | * checkpointed. Thus calling jbd2_journal_invalidatepage() | ||
130 | * (via truncate_inode_pages()) to discard these buffers can | ||
131 | * cause data loss. Also even if we did not discard these | ||
132 | * buffers, we would have no way to find them after the inode | ||
133 | * is reaped and thus user could see stale data if he tries to | ||
134 | * read them before the transaction is checkpointed. So be | ||
135 | * careful and force everything to disk here... We use | ||
136 | * ei->i_datasync_tid to store the newest transaction | ||
137 | * containing inode's data. | ||
138 | * | ||
139 | * Note that directories do not have this problem because they | ||
140 | * don't use page cache. | ||
141 | */ | ||
142 | if (ext4_should_journal_data(inode) && | ||
143 | (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { | ||
144 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | ||
145 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; | ||
146 | |||
147 | jbd2_log_start_commit(journal, commit_tid); | ||
148 | jbd2_log_wait_commit(journal, commit_tid); | ||
149 | filemap_write_and_wait(&inode->i_data); | ||
150 | } | ||
193 | truncate_inode_pages(&inode->i_data, 0); | 151 | truncate_inode_pages(&inode->i_data, 0); |
194 | goto no_delete; | 152 | goto no_delete; |
195 | } | 153 | } |
@@ -204,7 +162,7 @@ void ext4_evict_inode(struct inode *inode) | |||
204 | if (is_bad_inode(inode)) | 162 | if (is_bad_inode(inode)) |
205 | goto no_delete; | 163 | goto no_delete; |
206 | 164 | ||
207 | handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); | 165 | handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); |
208 | if (IS_ERR(handle)) { | 166 | if (IS_ERR(handle)) { |
209 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | 167 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); |
210 | /* | 168 | /* |
@@ -277,793 +235,6 @@ no_delete: | |||
277 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ | 235 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ |
278 | } | 236 | } |
279 | 237 | ||
280 | typedef struct { | ||
281 | __le32 *p; | ||
282 | __le32 key; | ||
283 | struct buffer_head *bh; | ||
284 | } Indirect; | ||
285 | |||
286 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | ||
287 | { | ||
288 | p->key = *(p->p = v); | ||
289 | p->bh = bh; | ||
290 | } | ||
291 | |||
292 | /** | ||
293 | * ext4_block_to_path - parse the block number into array of offsets | ||
294 | * @inode: inode in question (we are only interested in its superblock) | ||
295 | * @i_block: block number to be parsed | ||
296 | * @offsets: array to store the offsets in | ||
297 | * @boundary: set this non-zero if the referred-to block is likely to be | ||
298 | * followed (on disk) by an indirect block. | ||
299 | * | ||
300 | * To store the locations of a file's data ext4 uses a data structure common | ||
301 | * for UNIX filesystems - tree of pointers anchored in the inode, with | ||
302 | * data blocks at leaves and indirect blocks in intermediate nodes. | ||
303 | * This function translates the block number into path in that tree - | ||
304 | * return value is the path length and @offsets[n] is the offset of | ||
305 | * pointer to the (n+1)th node in the nth one. If @i_block is out of range | ||
306 | * (negative or too large) warning is printed and zero returned. | ||
307 | * | ||
308 | * Note: function doesn't find node addresses, so no IO is needed. All | ||
309 | * we need to know is the capacity of indirect blocks (taken from the | ||
310 | * inode->i_sb). | ||
311 | */ | ||
312 | |||
313 | /* | ||
314 | * Portability note: the last comparison (check that we fit into triple | ||
315 | * indirect block) is spelled differently, because otherwise on an | ||
316 | * architecture with 32-bit longs and 8Kb pages we might get into trouble | ||
317 | * if our filesystem had 8Kb blocks. We might use long long, but that would | ||
318 | * kill us on x86. Oh, well, at least the sign propagation does not matter - | ||
319 | * i_block would have to be negative in the very beginning, so we would not | ||
320 | * get there at all. | ||
321 | */ | ||
322 | |||
323 | static int ext4_block_to_path(struct inode *inode, | ||
324 | ext4_lblk_t i_block, | ||
325 | ext4_lblk_t offsets[4], int *boundary) | ||
326 | { | ||
327 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
328 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | ||
329 | const long direct_blocks = EXT4_NDIR_BLOCKS, | ||
330 | indirect_blocks = ptrs, | ||
331 | double_blocks = (1 << (ptrs_bits * 2)); | ||
332 | int n = 0; | ||
333 | int final = 0; | ||
334 | |||
335 | if (i_block < direct_blocks) { | ||
336 | offsets[n++] = i_block; | ||
337 | final = direct_blocks; | ||
338 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | ||
339 | offsets[n++] = EXT4_IND_BLOCK; | ||
340 | offsets[n++] = i_block; | ||
341 | final = ptrs; | ||
342 | } else if ((i_block -= indirect_blocks) < double_blocks) { | ||
343 | offsets[n++] = EXT4_DIND_BLOCK; | ||
344 | offsets[n++] = i_block >> ptrs_bits; | ||
345 | offsets[n++] = i_block & (ptrs - 1); | ||
346 | final = ptrs; | ||
347 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { | ||
348 | offsets[n++] = EXT4_TIND_BLOCK; | ||
349 | offsets[n++] = i_block >> (ptrs_bits * 2); | ||
350 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); | ||
351 | offsets[n++] = i_block & (ptrs - 1); | ||
352 | final = ptrs; | ||
353 | } else { | ||
354 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", | ||
355 | i_block + direct_blocks + | ||
356 | indirect_blocks + double_blocks, inode->i_ino); | ||
357 | } | ||
358 | if (boundary) | ||
359 | *boundary = final - 1 - (i_block & (ptrs - 1)); | ||
360 | return n; | ||
361 | } | ||
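A worked example of the decomposition above, assuming 4KB blocks (ptrs = 1024, ptrs_bits = 10) and the usual slot constants (EXT4_NDIR_BLOCKS = 12, EXT4_DIND_BLOCK = 13): logical block 5000 resolves to a depth-3 path through the double-indirect tree.

#include <stdio.h>

int main(void)
{
	const long direct = 12, ptrs = 1024, ptrs_bits = 10;
	long i_block = 5000, offsets[4] = {0};
	int n = 0;

	if (i_block < direct) {
		offsets[n++] = i_block;
	} else if ((i_block -= direct) < ptrs) {
		offsets[n++] = 12;			/* EXT4_IND_BLOCK */
		offsets[n++] = i_block;
	} else if ((i_block -= ptrs) < (1L << (ptrs_bits * 2))) {
		offsets[n++] = 13;			/* EXT4_DIND_BLOCK */
		offsets[n++] = i_block >> ptrs_bits;	/* 3964 >> 10 == 3 */
		offsets[n++] = i_block & (ptrs - 1);	/* 3964 & 1023 == 892 */
	}
	printf("depth %d: [%ld, %ld, %ld]\n", n, offsets[0], offsets[1], offsets[2]);
	return 0;
}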
362 | |||
363 | static int __ext4_check_blockref(const char *function, unsigned int line, | ||
364 | struct inode *inode, | ||
365 | __le32 *p, unsigned int max) | ||
366 | { | ||
367 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
368 | __le32 *bref = p; | ||
369 | unsigned int blk; | ||
370 | |||
371 | while (bref < p+max) { | ||
372 | blk = le32_to_cpu(*bref++); | ||
373 | if (blk && | ||
374 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
375 | blk, 1))) { | ||
376 | es->s_last_error_block = cpu_to_le64(blk); | ||
377 | ext4_error_inode(inode, function, line, blk, | ||
378 | "invalid block"); | ||
379 | return -EIO; | ||
380 | } | ||
381 | } | ||
382 | return 0; | ||
383 | } | ||
384 | |||
385 | |||
386 | #define ext4_check_indirect_blockref(inode, bh) \ | ||
387 | __ext4_check_blockref(__func__, __LINE__, inode, \ | ||
388 | (__le32 *)(bh)->b_data, \ | ||
389 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) | ||
390 | |||
391 | #define ext4_check_inode_blockref(inode) \ | ||
392 | __ext4_check_blockref(__func__, __LINE__, inode, \ | ||
393 | EXT4_I(inode)->i_data, \ | ||
394 | EXT4_NDIR_BLOCKS) | ||
395 | |||
396 | /** | ||
397 | * ext4_get_branch - read the chain of indirect blocks leading to data | ||
398 | * @inode: inode in question | ||
399 | * @depth: depth of the chain (1 - direct pointer, etc.) | ||
400 | * @offsets: offsets of pointers in inode/indirect blocks | ||
401 | * @chain: place to store the result | ||
402 | * @err: here we store the error value | ||
403 | * | ||
404 | * Function fills the array of triples <key, p, bh> and returns %NULL | ||
405 | * if everything went OK or the pointer to the last filled triple | ||
406 | * (incomplete one) otherwise. Upon the return chain[i].key contains | ||
407 | * the number of (i+1)-th block in the chain (as it is stored in memory, | ||
408 | * i.e. little-endian 32-bit), chain[i].p contains the address of that | ||
409 | * number (it points into struct inode for i==0 and into the bh->b_data | ||
410 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect | ||
411 | * block for i>0 and NULL for i==0. In other words, it holds the block | ||
412 | * numbers of the chain, addresses they were taken from (and where we can | ||
413 | * verify that chain did not change) and buffer_heads hosting these | ||
414 | * numbers. | ||
415 | * | ||
416 | * Function stops when it stumbles upon zero pointer (absent block) | ||
417 | * (pointer to last triple returned, *@err == 0) | ||
418 | * or when it gets an IO error reading an indirect block | ||
419 | * (ditto, *@err == -EIO) | ||
420 | * or when it reads all @depth-1 indirect blocks successfully and finds | ||
421 | * the whole chain, all way to the data (returns %NULL, *err == 0). | ||
422 | * | ||
423 | * Need to be called with | ||
424 | * down_read(&EXT4_I(inode)->i_data_sem) | ||
425 | */ | ||
426 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | ||
427 | ext4_lblk_t *offsets, | ||
428 | Indirect chain[4], int *err) | ||
429 | { | ||
430 | struct super_block *sb = inode->i_sb; | ||
431 | Indirect *p = chain; | ||
432 | struct buffer_head *bh; | ||
433 | |||
434 | *err = 0; | ||
435 | /* i_data is not going away, no lock needed */ | ||
436 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); | ||
437 | if (!p->key) | ||
438 | goto no_block; | ||
439 | while (--depth) { | ||
440 | bh = sb_getblk(sb, le32_to_cpu(p->key)); | ||
441 | if (unlikely(!bh)) | ||
442 | goto failure; | ||
443 | |||
444 | if (!bh_uptodate_or_lock(bh)) { | ||
445 | if (bh_submit_read(bh) < 0) { | ||
446 | put_bh(bh); | ||
447 | goto failure; | ||
448 | } | ||
449 | /* validate block references */ | ||
450 | if (ext4_check_indirect_blockref(inode, bh)) { | ||
451 | put_bh(bh); | ||
452 | goto failure; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); | ||
457 | /* Reader: end */ | ||
458 | if (!p->key) | ||
459 | goto no_block; | ||
460 | } | ||
461 | return NULL; | ||
462 | |||
463 | failure: | ||
464 | *err = -EIO; | ||
465 | no_block: | ||
466 | return p; | ||
467 | } | ||
468 | |||
469 | /** | ||
470 | * ext4_find_near - find a place for allocation with sufficient locality | ||
471 | * @inode: owner | ||
472 | * @ind: descriptor of indirect block. | ||
473 | * | ||
474 | * This function returns the preferred place for block allocation. | ||
475 | * It is used when heuristic for sequential allocation fails. | ||
476 | * Rules are: | ||
477 | * + if there is a block to the left of our position - allocate near it. | ||
478 | * + if pointer will live in indirect block - allocate near that block. | ||
479 | * + if pointer will live in inode - allocate in the same | ||
480 | * cylinder group. | ||
481 | * | ||
482 | * In the latter case we colour the starting block by the caller's PID to | ||
483 | * prevent it from clashing with concurrent allocations for a different inode | ||
484 | * in the same block group. The PID is used here so that functionally related | ||
485 | * files will be close-by on-disk. | ||
486 | * | ||
487 | * Caller must make sure that @ind is valid and will stay that way. | ||
488 | */ | ||
489 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | ||
490 | { | ||
491 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
492 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; | ||
493 | __le32 *p; | ||
494 | ext4_fsblk_t bg_start; | ||
495 | ext4_fsblk_t last_block; | ||
496 | ext4_grpblk_t colour; | ||
497 | ext4_group_t block_group; | ||
498 | int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); | ||
499 | |||
500 | /* Try to find previous block */ | ||
501 | for (p = ind->p - 1; p >= start; p--) { | ||
502 | if (*p) | ||
503 | return le32_to_cpu(*p); | ||
504 | } | ||
505 | |||
506 | /* No such thing, so let's try location of indirect block */ | ||
507 | if (ind->bh) | ||
508 | return ind->bh->b_blocknr; | ||
509 | |||
510 | /* | ||
511 | * It is going to be referred to from the inode itself? OK, just put it | ||
512 | * into the same cylinder group then. | ||
513 | */ | ||
514 | block_group = ei->i_block_group; | ||
515 | if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { | ||
516 | block_group &= ~(flex_size-1); | ||
517 | if (S_ISREG(inode->i_mode)) | ||
518 | block_group++; | ||
519 | } | ||
520 | bg_start = ext4_group_first_block_no(inode->i_sb, block_group); | ||
521 | last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; | ||
522 | |||
523 | /* | ||
524 | * If we are doing delayed allocation, we don't need to take | ||
525 | * colour into account. | ||
526 | */ | ||
527 | if (test_opt(inode->i_sb, DELALLOC)) | ||
528 | return bg_start; | ||
529 | |||
530 | if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) | ||
531 | colour = (current->pid % 16) * | ||
532 | (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); | ||
533 | else | ||
534 | colour = (current->pid % 16) * ((last_block - bg_start) / 16); | ||
535 | return bg_start + colour; | ||
536 | } | ||
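A worked example of the colouring above, with an assumed group size of 32768 blocks and an invented PID: a caller with pid % 16 == 5 starts its search 5 * (32768 / 16) = 10240 blocks into the group.

#include <stdio.h>

int main(void)
{
	long blocks_per_group = 32768;	/* assumed group size */
	long pid = 4021;		/* invented caller PID; 4021 % 16 == 5 */
	long colour = (pid % 16) * (blocks_per_group / 16);

	printf("colour = %ld\n", colour);	/* 10240 */
	return 0;
}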
537 | |||
538 | /** | ||
539 | * ext4_find_goal - find a preferred place for allocation. | ||
540 | * @inode: owner | ||
541 | * @block: block we want | ||
542 | * @partial: pointer to the last triple within a chain | ||
543 | * | ||
544 | * Normally this function finds the preferred place for block allocation | ||
545 | * and returns it. | ||
546 | * Because this is only used for non-extent files, we limit the block nr | ||
547 | * to 32 bits. | ||
548 | */ | ||
549 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | ||
550 | Indirect *partial) | ||
551 | { | ||
552 | ext4_fsblk_t goal; | ||
553 | |||
554 | /* | ||
555 | * XXX need to get goal block from mballoc's data structures | ||
556 | */ | ||
557 | |||
558 | goal = ext4_find_near(inode, partial); | ||
559 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
560 | return goal; | ||
561 | } | ||
562 | |||
563 | /** | ||
564 | * ext4_blks_to_allocate - Look up the block map and count the number | ||
565 | * of direct blocks that need to be allocated for the given branch. | ||
566 | * | ||
567 | * @branch: chain of indirect blocks | ||
568 | * @k: number of blocks needed for indirect blocks | ||
569 | * @blks: number of data blocks to be mapped. | ||
570 | * @blocks_to_boundary: the offset in the indirect block | ||
571 | * | ||
572 | * return the total number of blocks to be allocated, including the | ||
573 | * direct and indirect blocks. | ||
574 | */ | ||
575 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | ||
576 | int blocks_to_boundary) | ||
577 | { | ||
578 | unsigned int count = 0; | ||
579 | |||
580 | /* | ||
581 | * Simple case: if the [t,d]indirect block(s) have not been allocated yet, | ||
582 | * then clearly the blocks on that path have not been allocated either. | ||
583 | */ | ||
584 | if (k > 0) { | ||
585 | /* right now we don't handle cross boundary allocation */ | ||
586 | if (blks < blocks_to_boundary + 1) | ||
587 | count += blks; | ||
588 | else | ||
589 | count += blocks_to_boundary + 1; | ||
590 | return count; | ||
591 | } | ||
592 | |||
593 | count++; | ||
594 | while (count < blks && count <= blocks_to_boundary && | ||
595 | le32_to_cpu(*(branch[0].p + count)) == 0) { | ||
596 | count++; | ||
597 | } | ||
598 | return count; | ||
599 | } | ||
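A userspace model of the k == 0 counting loop above: starting from the first new direct block, free (zero) slots are counted up to the boundary, stopping at the first slot already in use (block numbers invented):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* slot[0] is the block just allocated; slot[3] is already mapped */
	uint32_t slot[8] = {9000, 0, 0, 7777, 0, 0, 0, 0};
	unsigned blks = 6, boundary = 7, count = 1;

	while (count < blks && count <= boundary && slot[count] == 0)
		count++;
	printf("count = %u\n", count);	/* 3: stop at the occupied slot */
	return 0;
}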
600 | |||
601 | /** | ||
602 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch | ||
603 | * @handle: handle for this transaction | ||
604 | * @inode: inode which needs allocated blocks | ||
605 | * @iblock: the logical block to start allocated at | ||
606 | * @goal: preferred physical block of allocation | ||
607 | * @indirect_blks: the number of blocks needed to allocate for indirect | ||
608 | * blocks | ||
609 | * @blks: number of desired blocks | ||
610 | * @new_blocks: on return it will store the new block numbers for | ||
611 | * the indirect blocks(if needed) and the first direct block, | ||
612 | * @err: on return it will store the error code | ||
613 | * | ||
614 | * This function will return the number of blocks allocated as | ||
615 | * requested by the passed-in parameters. | ||
616 | */ | ||
617 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
618 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
619 | int indirect_blks, int blks, | ||
620 | ext4_fsblk_t new_blocks[4], int *err) | ||
621 | { | ||
622 | struct ext4_allocation_request ar; | ||
623 | int target, i; | ||
624 | unsigned long count = 0, blk_allocated = 0; | ||
625 | int index = 0; | ||
626 | ext4_fsblk_t current_block = 0; | ||
627 | int ret = 0; | ||
628 | |||
629 | /* | ||
630 | * Here we try to allocate the requested multiple blocks at once, | ||
631 | * on a best-effort basis. | ||
632 | * To build a branch, we should allocate blocks for | ||
633 | * the indirect blocks (if not allocated yet), and at least | ||
634 | * the first direct block of this branch. That's the | ||
635 | * minimum number of blocks we need to allocate (required). | ||
636 | */ | ||
637 | /* first we try to allocate the indirect blocks */ | ||
638 | target = indirect_blks; | ||
639 | while (target > 0) { | ||
640 | count = target; | ||
641 | /* allocating blocks for indirect blocks and direct blocks */ | ||
642 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
643 | 0, &count, err); | ||
644 | if (*err) | ||
645 | goto failed_out; | ||
646 | |||
647 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
648 | EXT4_ERROR_INODE(inode, | ||
649 | "current_block %llu + count %lu > %d!", | ||
650 | current_block, count, | ||
651 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
652 | *err = -EIO; | ||
653 | goto failed_out; | ||
654 | } | ||
655 | |||
656 | target -= count; | ||
657 | /* allocate blocks for indirect blocks */ | ||
658 | while (index < indirect_blks && count) { | ||
659 | new_blocks[index++] = current_block++; | ||
660 | count--; | ||
661 | } | ||
662 | if (count > 0) { | ||
663 | /* | ||
664 | * save the new block number | ||
665 | * for the first direct block | ||
666 | */ | ||
667 | new_blocks[index] = current_block; | ||
668 | printk(KERN_INFO "%s returned more blocks than " | ||
669 | "requested\n", __func__); | ||
670 | WARN_ON(1); | ||
671 | break; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | target = blks - count; | ||
676 | blk_allocated = count; | ||
677 | if (!target) | ||
678 | goto allocated; | ||
679 | /* Now allocate data blocks */ | ||
680 | memset(&ar, 0, sizeof(ar)); | ||
681 | ar.inode = inode; | ||
682 | ar.goal = goal; | ||
683 | ar.len = target; | ||
684 | ar.logical = iblock; | ||
685 | if (S_ISREG(inode->i_mode)) | ||
686 | /* enable in-core preallocation only for regular files */ | ||
687 | ar.flags = EXT4_MB_HINT_DATA; | ||
688 | |||
689 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
690 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
691 | EXT4_ERROR_INODE(inode, | ||
692 | "current_block %llu + ar.len %d > %d!", | ||
693 | current_block, ar.len, | ||
694 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
695 | *err = -EIO; | ||
696 | goto failed_out; | ||
697 | } | ||
698 | |||
699 | if (*err && (target == blks)) { | ||
700 | /* | ||
701 | * if the allocation failed and we didn't allocate | ||
702 | * any blocks before | ||
703 | */ | ||
704 | goto failed_out; | ||
705 | } | ||
706 | if (!*err) { | ||
707 | if (target == blks) { | ||
708 | /* | ||
709 | * save the new block number | ||
710 | * for the first direct block | ||
711 | */ | ||
712 | new_blocks[index] = current_block; | ||
713 | } | ||
714 | blk_allocated += ar.len; | ||
715 | } | ||
716 | allocated: | ||
717 | /* total number of blocks allocated for direct blocks */ | ||
718 | ret = blk_allocated; | ||
719 | *err = 0; | ||
720 | return ret; | ||
721 | failed_out: | ||
722 | for (i = 0; i < index; i++) | ||
723 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
724 | return ret; | ||
725 | } | ||
726 | |||
727 | /** | ||
728 | * ext4_alloc_branch - allocate and set up a chain of blocks. | ||
729 | * @handle: handle for this transaction | ||
730 | * @inode: owner | ||
731 | * @indirect_blks: number of allocated indirect blocks | ||
732 | * @blks: number of allocated direct blocks | ||
733 | * @goal: preferred place for allocation | ||
734 | * @offsets: offsets (in the blocks) to store the pointers to next. | ||
735 | * @branch: place to store the chain in. | ||
736 | * | ||
737 | * This function allocates blocks, zeroes out all but the last one, | ||
738 | * links them into chain and (if we are synchronous) writes them to disk. | ||
739 | * In other words, it prepares a branch that can be spliced onto the | ||
740 | * inode. It stores the information about that chain in the branch[], in | ||
741 | * the same format as ext4_get_branch() would do. We are calling it after | ||
742 | * we had read the existing part of chain and partial points to the last | ||
743 | * triple of that (one with zero ->key). Upon the exit we have the same | ||
744 | * picture as after the successful ext4_get_block(), except that in one | ||
745 | * place chain is disconnected - *branch->p is still zero (we did not | ||
746 | * set the last link), but branch->key contains the number that should | ||
747 | * be placed into *branch->p to fill that gap. | ||
748 | * | ||
749 | * If allocation fails we free all blocks we've allocated (and forget | ||
750 | * their buffer_heads) and return the error value from the failed | ||
751 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | ||
752 | * as described above and return 0. | ||
753 | */ | ||
754 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | ||
755 | ext4_lblk_t iblock, int indirect_blks, | ||
756 | int *blks, ext4_fsblk_t goal, | ||
757 | ext4_lblk_t *offsets, Indirect *branch) | ||
758 | { | ||
759 | int blocksize = inode->i_sb->s_blocksize; | ||
760 | int i, n = 0; | ||
761 | int err = 0; | ||
762 | struct buffer_head *bh; | ||
763 | int num; | ||
764 | ext4_fsblk_t new_blocks[4]; | ||
765 | ext4_fsblk_t current_block; | ||
766 | |||
767 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
768 | *blks, new_blocks, &err); | ||
769 | if (err) | ||
770 | return err; | ||
771 | |||
772 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
773 | /* | ||
774 | * metadata blocks and data blocks are allocated. | ||
775 | */ | ||
776 | for (n = 1; n <= indirect_blks; n++) { | ||
777 | /* | ||
778 | * Get buffer_head for parent block, zero it out | ||
779 | * and set the pointer to new one, then send | ||
780 | * parent to disk. | ||
781 | */ | ||
782 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | ||
783 | if (unlikely(!bh)) { | ||
784 | err = -EIO; | ||
785 | goto failed; | ||
786 | } | ||
787 | |||
788 | branch[n].bh = bh; | ||
789 | lock_buffer(bh); | ||
790 | BUFFER_TRACE(bh, "call get_create_access"); | ||
791 | err = ext4_journal_get_create_access(handle, bh); | ||
792 | if (err) { | ||
793 | /* Don't brelse(bh) here; it's done in | ||
794 | * ext4_journal_forget() below */ | ||
795 | unlock_buffer(bh); | ||
796 | goto failed; | ||
797 | } | ||
798 | |||
799 | memset(bh->b_data, 0, blocksize); | ||
800 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | ||
801 | branch[n].key = cpu_to_le32(new_blocks[n]); | ||
802 | *branch[n].p = branch[n].key; | ||
803 | if (n == indirect_blks) { | ||
804 | current_block = new_blocks[n]; | ||
805 | /* | ||
806 | * End of chain, update the last new metablock of | ||
807 | * the chain to point to the new allocated | ||
808 | * data blocks numbers | ||
809 | */ | ||
810 | for (i = 1; i < num; i++) | ||
811 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
812 | } | ||
813 | BUFFER_TRACE(bh, "marking uptodate"); | ||
814 | set_buffer_uptodate(bh); | ||
815 | unlock_buffer(bh); | ||
816 | |||
817 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
818 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
819 | if (err) | ||
820 | goto failed; | ||
821 | } | ||
822 | *blks = num; | ||
823 | return err; | ||
824 | failed: | ||
825 | /* Allocation failed, free what we already allocated */ | ||
826 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | ||
827 | for (i = 1; i <= n ; i++) { | ||
828 | /* | ||
829 | * branch[i].bh is newly allocated, so there is no | ||
830 | * need to revoke the block, which is why we don't | ||
831 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
832 | */ | ||
833 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
834 | EXT4_FREE_BLOCKS_FORGET); | ||
835 | } | ||
836 | for (i = n+1; i < indirect_blks; i++) | ||
837 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
838 | |||
839 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
840 | |||
841 | return err; | ||
842 | } | ||
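
For readers new to the chain format, the linking step can be modelled in a few lines. The sketch below is purely illustrative: the block numbers, the slot offset, and the tiny eight-slot "block" are invented, and a uint32_t array stands in for bh->b_data.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define ADDR_PER_BLOCK 8   /* tiny "block" so the output stays readable */

int main(void)
{
    /* One freshly allocated indirect block, zeroed like bh->b_data. */
    uint32_t indirect[ADDR_PER_BLOCK];
    uint32_t new_blocks[] = { 500, 501, 502 }; /* run of data blocks */
    int offset = 2;   /* slot within the indirect block, from offsets[] */
    int i;

    memset(indirect, 0, sizeof(indirect));

    /* Link the run of data blocks at the chosen offset, just as the
     * end-of-chain loop above fills branch[n].p + i. */
    for (i = 0; i < 3; i++)
        indirect[offset + i] = new_blocks[i];

    for (i = 0; i < ADDR_PER_BLOCK; i++)
        printf("slot %d -> block %u\n", i, indirect[i]);
    return 0;
}
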
843 | |||
844 | /** | ||
845 | * ext4_splice_branch - splice the allocated branch onto inode. | ||
846 | * @handle: handle for this transaction | ||
847 | * @inode: owner | ||
848 | * @block: (logical) number of block we are adding | ||
849 | * @chain: chain of indirect blocks (with a missing link - see | ||
850 | * ext4_alloc_branch) | ||
851 | * @where: location of missing link | ||
852 | * @num: number of indirect blocks we are adding | ||
853 | * @blks: number of direct blocks we are adding | ||
854 | * | ||
855 | * This function fills the missing link and does all housekeeping needed in | ||
856 | * inode (->i_blocks, etc.). In case of success we end up with the full | ||
857 | * chain to new block and return 0. | ||
858 | */ | ||
859 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | ||
860 | ext4_lblk_t block, Indirect *where, int num, | ||
861 | int blks) | ||
862 | { | ||
863 | int i; | ||
864 | int err = 0; | ||
865 | ext4_fsblk_t current_block; | ||
866 | |||
867 | /* | ||
868 | * If we're splicing into a [td]indirect block (as opposed to the | ||
869 | * inode) then we need to get write access to the [td]indirect block | ||
870 | * before the splice. | ||
871 | */ | ||
872 | if (where->bh) { | ||
873 | BUFFER_TRACE(where->bh, "get_write_access"); | ||
874 | err = ext4_journal_get_write_access(handle, where->bh); | ||
875 | if (err) | ||
876 | goto err_out; | ||
877 | } | ||
878 | /* That's it */ | ||
879 | |||
880 | *where->p = where->key; | ||
881 | |||
882 | /* | ||
883 | * Update the host buffer_head or inode to point to the just | ||
884 | * allocated direct blocks | ||
885 | */ | ||
886 | if (num == 0 && blks > 1) { | ||
887 | current_block = le32_to_cpu(where->key) + 1; | ||
888 | for (i = 1; i < blks; i++) | ||
889 | *(where->p + i) = cpu_to_le32(current_block++); | ||
890 | } | ||
891 | |||
892 | /* We are done with atomic stuff, now do the rest of housekeeping */ | ||
893 | /* had we spliced it onto indirect block? */ | ||
894 | if (where->bh) { | ||
895 | /* | ||
896 | * If we spliced it onto an indirect block, we haven't | ||
897 | * altered the inode. Note however that if it is being spliced | ||
898 | * onto an indirect block at the very end of the file (the | ||
899 | * file is growing) then we *will* alter the inode to reflect | ||
900 | * the new i_size. But that is not done here - it is done in | ||
901 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | ||
902 | */ | ||
903 | jbd_debug(5, "splicing indirect only\n"); | ||
904 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | ||
905 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | ||
906 | if (err) | ||
907 | goto err_out; | ||
908 | } else { | ||
909 | /* | ||
910 | * OK, we spliced it into the inode itself on a direct block. | ||
911 | */ | ||
912 | ext4_mark_inode_dirty(handle, inode); | ||
913 | jbd_debug(5, "splicing direct\n"); | ||
914 | } | ||
915 | return err; | ||
916 | |||
917 | err_out: | ||
918 | for (i = 1; i <= num; i++) { | ||
919 | /* | ||
920 | * branch[i].bh is newly allocated, so there is no | ||
921 | * need to revoke the block, which is why we don't | ||
922 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
923 | */ | ||
924 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | ||
925 | EXT4_FREE_BLOCKS_FORGET); | ||
926 | } | ||
927 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | ||
928 | blks, 0); | ||
929 | |||
930 | return err; | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * The ext4_ind_map_blocks() function handles non-extents inodes | ||
935 | * (i.e., using the traditional indirect/double-indirect i_blocks | ||
936 | * scheme) for ext4_map_blocks(). | ||
937 | * | ||
938 | * Allocation strategy is simple: if we have to allocate something, we will | ||
939 | * have to go the whole way to leaf. So let's do it before attaching anything | ||
940 | * to tree, set linkage between the newborn blocks, write them if sync is | ||
941 | * required, recheck the path, free and repeat if check fails, otherwise | ||
942 | * set the last missing link (that will protect us from any truncate-generated | ||
943 | * removals - all blocks on the path are immune now) and possibly force the | ||
944 | * write on the parent block. | ||
945 | * That has a nice additional property: no special recovery from the failed | ||
946 | * allocations is needed - we simply release blocks and do not touch anything | ||
947 | * reachable from inode. | ||
948 | * | ||
949 | * `handle' can be NULL if create == 0. | ||
950 | * | ||
951 | * return > 0, # of blocks mapped or allocated. | ||
952 | * return = 0, if plain lookup failed. | ||
953 | * return < 0, error case. | ||
954 | * | ||
955 | * The ext4_ind_map_blocks() function should be called with | ||
956 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | ||
957 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | ||
958 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | ||
959 | * blocks. | ||
960 | */ | ||
961 | static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | ||
962 | struct ext4_map_blocks *map, | ||
963 | int flags) | ||
964 | { | ||
965 | int err = -EIO; | ||
966 | ext4_lblk_t offsets[4]; | ||
967 | Indirect chain[4]; | ||
968 | Indirect *partial; | ||
969 | ext4_fsblk_t goal; | ||
970 | int indirect_blks; | ||
971 | int blocks_to_boundary = 0; | ||
972 | int depth; | ||
973 | int count = 0; | ||
974 | ext4_fsblk_t first_block = 0; | ||
975 | |||
976 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | ||
977 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | ||
978 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | ||
979 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | ||
980 | &blocks_to_boundary); | ||
981 | |||
982 | if (depth == 0) | ||
983 | goto out; | ||
984 | |||
985 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | ||
986 | |||
987 | /* Simplest case - block found, no allocation needed */ | ||
988 | if (!partial) { | ||
989 | first_block = le32_to_cpu(chain[depth - 1].key); | ||
990 | count++; | ||
991 | /* map more blocks */ | ||
992 | while (count < map->m_len && count <= blocks_to_boundary) { | ||
993 | ext4_fsblk_t blk; | ||
994 | |||
995 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | ||
996 | |||
997 | if (blk == first_block + count) | ||
998 | count++; | ||
999 | else | ||
1000 | break; | ||
1001 | } | ||
1002 | goto got_it; | ||
1003 | } | ||
1004 | |||
1005 | /* Next simple case - plain lookup or failed read of indirect block */ | ||
1006 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | ||
1007 | goto cleanup; | ||
1008 | |||
1009 | /* | ||
1010 | * Okay, we need to do block allocation. | ||
1011 | */ | ||
1012 | goal = ext4_find_goal(inode, map->m_lblk, partial); | ||
1013 | |||
1014 | /* the number of blocks need to allocate for [d,t]indirect blocks */ | ||
1015 | indirect_blks = (chain + depth) - partial - 1; | ||
1016 | |||
1017 | /* | ||
1018 | * Next look up the indirect map to count the total number of | ||
1019 | * direct blocks to allocate for this branch. | ||
1020 | */ | ||
1021 | count = ext4_blks_to_allocate(partial, indirect_blks, | ||
1022 | map->m_len, blocks_to_boundary); | ||
1023 | /* | ||
1024 | * Block out ext4_truncate while we alter the tree | ||
1025 | */ | ||
1026 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | ||
1027 | &count, goal, | ||
1028 | offsets + (partial - chain), partial); | ||
1029 | |||
1030 | /* | ||
1031 | * The ext4_splice_branch call will free and forget any buffers | ||
1032 | * on the new chain if there is a failure, but that risks using | ||
1033 | * up transaction credits, especially for bitmaps where the | ||
1034 | * credits cannot be returned. Can we handle this somehow? We | ||
1035 | * may need to return -EAGAIN upwards in the worst case. --sct | ||
1036 | */ | ||
1037 | if (!err) | ||
1038 | err = ext4_splice_branch(handle, inode, map->m_lblk, | ||
1039 | partial, indirect_blks, count); | ||
1040 | if (err) | ||
1041 | goto cleanup; | ||
1042 | |||
1043 | map->m_flags |= EXT4_MAP_NEW; | ||
1044 | |||
1045 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
1046 | got_it: | ||
1047 | map->m_flags |= EXT4_MAP_MAPPED; | ||
1048 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | ||
1049 | map->m_len = count; | ||
1050 | if (count > blocks_to_boundary) | ||
1051 | map->m_flags |= EXT4_MAP_BOUNDARY; | ||
1052 | err = count; | ||
1053 | /* Clean up and exit */ | ||
1054 | partial = chain + depth - 1; /* the whole chain */ | ||
1055 | cleanup: | ||
1056 | while (partial > chain) { | ||
1057 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1058 | brelse(partial->bh); | ||
1059 | partial--; | ||
1060 | } | ||
1061 | out: | ||
1062 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, | ||
1063 | map->m_pblk, map->m_len, err); | ||
1064 | return err; | ||
1065 | } | ||
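
The three-way return convention deserves emphasis, since callers must handle all of it. Here is a hypothetical caller, written against an invented map_blocks() stub that merely imitates the same convention:

#include <stdio.h>

/* Invented stub with the same return convention as above:
 *  > 0  number of blocks mapped or allocated,
 *  == 0 plain lookup found a hole,
 *  < 0  error code. */
static int map_blocks(unsigned long lblk, unsigned long *pblk)
{
    if (lblk < 12) {          /* pretend the first 12 blocks are mapped */
        *pblk = 1000 + lblk;
        return 1;
    }
    return 0;                 /* a hole */
}

int main(void)
{
    unsigned long pblk = 0;
    int ret = map_blocks(5, &pblk);

    if (ret > 0)
        printf("mapped %d block(s), first at %lu\n", ret, pblk);
    else if (ret == 0)
        printf("hole: nothing mapped\n");
    else
        printf("error %d\n", ret);
    return 0;
}
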
1066 | |||
1067 | #ifdef CONFIG_QUOTA | 238 | #ifdef CONFIG_QUOTA |
1068 | qsize_t *ext4_get_reserved_space(struct inode *inode) | 239 | qsize_t *ext4_get_reserved_space(struct inode *inode) |
1069 | { | 240 | { |
@@ -1073,33 +244,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) | |||
1073 | 244 | ||
1074 | /* | 245 | /* |
1075 | * Calculate the number of metadata blocks need to reserve | 246 | * Calculate the number of metadata blocks need to reserve |
1076 | * to allocate a new block at @lblock for a non-extent-based file | ||
1077 | */ | ||
1078 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, | ||
1079 | sector_t lblock) | ||
1080 | { | ||
1081 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1082 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); | ||
1083 | int blk_bits; | ||
1084 | |||
1085 | if (lblock < EXT4_NDIR_BLOCKS) | ||
1086 | return 0; | ||
1087 | |||
1088 | lblock -= EXT4_NDIR_BLOCKS; | ||
1089 | |||
1090 | if (ei->i_da_metadata_calc_len && | ||
1091 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | ||
1092 | ei->i_da_metadata_calc_len++; | ||
1093 | return 0; | ||
1094 | } | ||
1095 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | ||
1096 | ei->i_da_metadata_calc_len = 1; | ||
1097 | blk_bits = order_base_2(lblock); | ||
1098 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | ||
1099 | } | ||
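
The estimate in the helper removed here (it moves to indirect.c as ext4_ind_calc_metadata_amount()) reduces to one metadata block per tree level spanned by the logical block's bit width. A standalone restatement, assuming 4K blocks (1024 pointers per indirect block), a rough order_base_2() equivalent, and with the per-inode dind cache left out:

#include <stdio.h>

#define ADDR_PER_BLOCK_BITS 10   /* assumed 4K blocks: 1024 pointers */
#define NDIR_BLOCKS 12

/* Rough equivalent of the kernel's order_base_2():
 * smallest b with (1 << b) >= n. */
static int order_base_2(unsigned long n)
{
    int b = 0;

    while ((1UL << b) < n)
        b++;
    return b;
}

/* Worst-case new metadata blocks for a block at @lblock. */
static int ind_metadata_amount(unsigned long lblock)
{
    if (lblock < NDIR_BLOCKS)
        return 0;
    return order_base_2(lblock - NDIR_BLOCKS) / ADDR_PER_BLOCK_BITS + 1;
}

int main(void)
{
    printf("lblock 5      -> %d\n", ind_metadata_amount(5));      /* 0 */
    printf("lblock 100    -> %d\n", ind_metadata_amount(100));    /* 1 */
    printf("lblock 200000 -> %d\n", ind_metadata_amount(200000)); /* 2 */
    return 0;
}
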
1100 | |||
1101 | /* | ||
1102 | * Calculate the number of metadata blocks need to reserve | ||
1103 | * to allocate a block located at @lblock | 247 | * to allocate a block located at @lblock |
1104 | */ | 248 | */ |
1105 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) | 249 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) |
@@ -1107,7 +251,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) | |||
1107 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 251 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
1108 | return ext4_ext_calc_metadata_amount(inode, lblock); | 252 | return ext4_ext_calc_metadata_amount(inode, lblock); |
1109 | 253 | ||
1110 | return ext4_indirect_calc_metadata_amount(inode, lblock); | 254 | return ext4_ind_calc_metadata_amount(inode, lblock); |
1111 | } | 255 | } |
1112 | 256 | ||
1113 | /* | 257 | /* |
@@ -1589,16 +733,6 @@ static int do_journal_get_write_access(handle_t *handle, | |||
1589 | return ret; | 733 | return ret; |
1590 | } | 734 | } |
1591 | 735 | ||
1592 | /* | ||
1593 | * Truncate blocks that were not used by write. We have to truncate the | ||
1594 | * pagecache as well so that corresponding buffers get properly unmapped. | ||
1595 | */ | ||
1596 | static void ext4_truncate_failed_write(struct inode *inode) | ||
1597 | { | ||
1598 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
1599 | ext4_truncate(inode); | ||
1600 | } | ||
1601 | |||
1602 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, | 736 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, |
1603 | struct buffer_head *bh_result, int create); | 737 | struct buffer_head *bh_result, int create); |
1604 | static int ext4_write_begin(struct file *file, struct address_space *mapping, | 738 | static int ext4_write_begin(struct file *file, struct address_space *mapping, |
@@ -1863,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file, | |||
1863 | if (new_i_size > inode->i_size) | 997 | if (new_i_size > inode->i_size) |
1864 | i_size_write(inode, pos+copied); | 998 | i_size_write(inode, pos+copied); |
1865 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 999 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
1000 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | ||
1866 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 1001 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
1867 | ext4_update_i_disksize(inode, new_i_size); | 1002 | ext4_update_i_disksize(inode, new_i_size); |
1868 | ret2 = ext4_mark_inode_dirty(handle, inode); | 1003 | ret2 = ext4_mark_inode_dirty(handle, inode); |
@@ -2571,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page, | |||
2571 | write_end_fn); | 1706 | write_end_fn); |
2572 | if (ret == 0) | 1707 | if (ret == 0) |
2573 | ret = err; | 1708 | ret = err; |
1709 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | ||
2574 | err = ext4_journal_stop(handle); | 1710 | err = ext4_journal_stop(handle); |
2575 | if (!ret) | 1711 | if (!ret) |
2576 | ret = err; | 1712 | ret = err; |
@@ -3450,112 +2586,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3450 | } | 2586 | } |
3451 | 2587 | ||
3452 | /* | 2588 | /* |
3453 | * O_DIRECT for ext3 (or indirect map) based files | ||
3454 | * | ||
3455 | * If the O_DIRECT write will extend the file then add this inode to the | ||
3456 | * orphan list. So recovery will truncate it back to the original size | ||
3457 | * if the machine crashes during the write. | ||
3458 | * | ||
3459 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | ||
3460 | * crashes then stale disk data _may_ be exposed inside the file. But current | ||
3461 | * VFS code falls back into buffered path in that case so we are safe. | ||
3462 | */ | ||
3463 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
3464 | const struct iovec *iov, loff_t offset, | ||
3465 | unsigned long nr_segs) | ||
3466 | { | ||
3467 | struct file *file = iocb->ki_filp; | ||
3468 | struct inode *inode = file->f_mapping->host; | ||
3469 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3470 | handle_t *handle; | ||
3471 | ssize_t ret; | ||
3472 | int orphan = 0; | ||
3473 | size_t count = iov_length(iov, nr_segs); | ||
3474 | int retries = 0; | ||
3475 | |||
3476 | if (rw == WRITE) { | ||
3477 | loff_t final_size = offset + count; | ||
3478 | |||
3479 | if (final_size > inode->i_size) { | ||
3480 | /* Credits for sb + inode write */ | ||
3481 | handle = ext4_journal_start(inode, 2); | ||
3482 | if (IS_ERR(handle)) { | ||
3483 | ret = PTR_ERR(handle); | ||
3484 | goto out; | ||
3485 | } | ||
3486 | ret = ext4_orphan_add(handle, inode); | ||
3487 | if (ret) { | ||
3488 | ext4_journal_stop(handle); | ||
3489 | goto out; | ||
3490 | } | ||
3491 | orphan = 1; | ||
3492 | ei->i_disksize = inode->i_size; | ||
3493 | ext4_journal_stop(handle); | ||
3494 | } | ||
3495 | } | ||
3496 | |||
3497 | retry: | ||
3498 | if (rw == READ && ext4_should_dioread_nolock(inode)) | ||
3499 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
3500 | inode->i_sb->s_bdev, iov, | ||
3501 | offset, nr_segs, | ||
3502 | ext4_get_block, NULL, NULL, 0); | ||
3503 | else { | ||
3504 | ret = blockdev_direct_IO(rw, iocb, inode, iov, | ||
3505 | offset, nr_segs, ext4_get_block); | ||
3506 | |||
3507 | if (unlikely((rw & WRITE) && ret < 0)) { | ||
3508 | loff_t isize = i_size_read(inode); | ||
3509 | loff_t end = offset + iov_length(iov, nr_segs); | ||
3510 | |||
3511 | if (end > isize) | ||
3512 | ext4_truncate_failed_write(inode); | ||
3513 | } | ||
3514 | } | ||
3515 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
3516 | goto retry; | ||
3517 | |||
3518 | if (orphan) { | ||
3519 | int err; | ||
3520 | |||
3521 | /* Credits for sb + inode write */ | ||
3522 | handle = ext4_journal_start(inode, 2); | ||
3523 | if (IS_ERR(handle)) { | ||
3524 | /* This is really bad luck. We've written the data | ||
3525 | * but cannot extend i_size. Bail out and pretend | ||
3526 | * the write failed... */ | ||
3527 | ret = PTR_ERR(handle); | ||
3528 | if (inode->i_nlink) | ||
3529 | ext4_orphan_del(NULL, inode); | ||
3530 | |||
3531 | goto out; | ||
3532 | } | ||
3533 | if (inode->i_nlink) | ||
3534 | ext4_orphan_del(handle, inode); | ||
3535 | if (ret > 0) { | ||
3536 | loff_t end = offset + ret; | ||
3537 | if (end > inode->i_size) { | ||
3538 | ei->i_disksize = end; | ||
3539 | i_size_write(inode, end); | ||
3540 | /* | ||
3541 | * We're going to return a positive `ret' | ||
3542 | * here due to non-zero-length I/O, so there's | ||
3543 | * no way of reporting error returns from | ||
3544 | * ext4_mark_inode_dirty() to userspace. So | ||
3545 | * ignore it. | ||
3546 | */ | ||
3547 | ext4_mark_inode_dirty(handle, inode); | ||
3548 | } | ||
3549 | } | ||
3550 | err = ext4_journal_stop(handle); | ||
3551 | if (ret == 0) | ||
3552 | ret = err; | ||
3553 | } | ||
3554 | out: | ||
3555 | return ret; | ||
3556 | } | ||
3557 | |||
3558 | /* | ||
3559 | * ext4_get_block used when preparing for a DIO write or buffer write. | 2589 | * ext4_get_block used when preparing for a DIO write or buffer write. |
3560 | * We allocate an uinitialized extent if blocks haven't been allocated. | 2590 | * We allocate an uinitialized extent if blocks haven't been allocated. |
3561 | * The extent will be converted to initialized after the IO is complete. | 2591 | * The extent will be converted to initialized after the IO is complete. |
@@ -4033,383 +3063,6 @@ unlock: | |||
4033 | return err; | 3063 | return err; |
4034 | } | 3064 | } |
4035 | 3065 | ||
4036 | /* | ||
4037 | * Probably it should be a library function... search for first non-zero word | ||
4038 | * or memcmp with zero_page, whatever is better for particular architecture. | ||
4039 | * Linus? | ||
4040 | */ | ||
4041 | static inline int all_zeroes(__le32 *p, __le32 *q) | ||
4042 | { | ||
4043 | while (p < q) | ||
4044 | if (*p++) | ||
4045 | return 0; | ||
4046 | return 1; | ||
4047 | } | ||
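
Nothing kernel-specific hides in this helper; it behaves identically in userspace. A quick demonstration against an invented slot array:

#include <stdio.h>
#include <stdint.h>

/* Same idea: scan a run of 32-bit slots for any non-zero word. */
static int all_zeroes(const uint32_t *p, const uint32_t *q)
{
    while (p < q)
        if (*p++)
            return 0;
    return 1;
}

int main(void)
{
    uint32_t slots[] = { 0, 0, 7, 0 };

    printf("%d\n", all_zeroes(slots, slots + 2)); /* 1: first two empty */
    printf("%d\n", all_zeroes(slots, slots + 4)); /* 0: slot 2 is used */
    return 0;
}
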
4048 | |||
4049 | /** | ||
4050 | * ext4_find_shared - find the indirect blocks for partial truncation. | ||
4051 | * @inode: inode in question | ||
4052 | * @depth: depth of the affected branch | ||
4053 | * @offsets: offsets of pointers in that branch (see ext4_block_to_path) | ||
4054 | * @chain: place to store the pointers to partial indirect blocks | ||
4055 | * @top: place to the (detached) top of branch | ||
4056 | * | ||
4057 | * This is a helper function used by ext4_truncate(). | ||
4058 | * | ||
4059 | * When we do truncate() we may have to clean the ends of several | ||
4060 | * indirect blocks but leave the blocks themselves alive. A block is | ||
4061 | * partially truncated if some data below the new i_size is referred to | ||
4062 | * from it (and it is on the path to the first completely truncated | ||
4063 | * data block, indeed). We have to free the top of that path along | ||
4064 | * with everything to the right of the path. Since no allocation | ||
4065 | * past the truncation point is possible until ext4_truncate() | ||
4066 | * finishes, we may safely do the latter, but top of branch may | ||
4067 | * require special attention - pageout below the truncation point | ||
4068 | * might try to populate it. | ||
4069 | * | ||
4070 | * We atomically detach the top of branch from the tree, store the | ||
4071 | * block number of its root in *@top, pointers to buffer_heads of | ||
4072 | * partially truncated blocks - in @chain[].bh and pointers to | ||
4073 | * their last elements that should not be removed - in | ||
4074 | * @chain[].p. Return value is the pointer to last filled element | ||
4075 | * of @chain. | ||
4076 | * | ||
4077 | * The work left to the caller is the actual freeing of subtrees: | ||
4078 | * a) free the subtree starting from *@top | ||
4079 | * b) free the subtrees whose roots are stored in | ||
4080 | * (@chain[i].p+1 .. end of @chain[i].bh->b_data) | ||
4081 | * c) free the subtrees growing from the inode past the @chain[0]. | ||
4082 | * (no partially truncated stuff there). */ | ||
4083 | |||
4084 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | ||
4085 | ext4_lblk_t offsets[4], Indirect chain[4], | ||
4086 | __le32 *top) | ||
4087 | { | ||
4088 | Indirect *partial, *p; | ||
4089 | int k, err; | ||
4090 | |||
4091 | *top = 0; | ||
4092 | /* Make k index the deepest non-null offset + 1 */ | ||
4093 | for (k = depth; k > 1 && !offsets[k-1]; k--) | ||
4094 | ; | ||
4095 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | ||
4096 | /* Writer: pointers */ | ||
4097 | if (!partial) | ||
4098 | partial = chain + k-1; | ||
4099 | /* | ||
4100 | * If the branch acquired continuation since we've looked at it - | ||
4101 | * fine, it should all survive and (new) top doesn't belong to us. | ||
4102 | */ | ||
4103 | if (!partial->key && *partial->p) | ||
4104 | /* Writer: end */ | ||
4105 | goto no_top; | ||
4106 | for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) | ||
4107 | ; | ||
4108 | /* | ||
4109 | * OK, we've found the last block that must survive. The rest of our | ||
4110 | * branch should be detached before unlocking. However, if that rest | ||
4111 | * of branch is all ours and does not grow immediately from the inode | ||
4112 | * it's easier to cheat and just decrement partial->p. | ||
4113 | */ | ||
4114 | if (p == chain + k - 1 && p > chain) { | ||
4115 | p->p--; | ||
4116 | } else { | ||
4117 | *top = *p->p; | ||
4118 | /* Nope, don't do this in ext4. Must leave the tree intact */ | ||
4119 | #if 0 | ||
4120 | *p->p = 0; | ||
4121 | #endif | ||
4122 | } | ||
4123 | /* Writer: end */ | ||
4124 | |||
4125 | while (partial > p) { | ||
4126 | brelse(partial->bh); | ||
4127 | partial--; | ||
4128 | } | ||
4129 | no_top: | ||
4130 | return partial; | ||
4131 | } | ||
4132 | |||
4133 | /* | ||
4134 | * Zero a number of block pointers in either an inode or an indirect block. | ||
4135 | * If we restart the transaction we must again get write access to the | ||
4136 | * indirect block for further modification. | ||
4137 | * | ||
4138 | * We release `count' blocks on disk, but (last - first) may be greater | ||
4139 | * than `count' because there can be holes in there. | ||
4140 | * | ||
4141 | * Return 0 on success, 1 on invalid block range | ||
4142 | * and < 0 on fatal error. | ||
4143 | */ | ||
4144 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
4145 | struct buffer_head *bh, | ||
4146 | ext4_fsblk_t block_to_free, | ||
4147 | unsigned long count, __le32 *first, | ||
4148 | __le32 *last) | ||
4149 | { | ||
4150 | __le32 *p; | ||
4151 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
4152 | int err; | ||
4153 | |||
4154 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
4155 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
4156 | |||
4157 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
4158 | count)) { | ||
4159 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
4160 | "blocks %llu len %lu", | ||
4161 | (unsigned long long) block_to_free, count); | ||
4162 | return 1; | ||
4163 | } | ||
4164 | |||
4165 | if (try_to_extend_transaction(handle, inode)) { | ||
4166 | if (bh) { | ||
4167 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
4168 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
4169 | if (unlikely(err)) | ||
4170 | goto out_err; | ||
4171 | } | ||
4172 | err = ext4_mark_inode_dirty(handle, inode); | ||
4173 | if (unlikely(err)) | ||
4174 | goto out_err; | ||
4175 | err = ext4_truncate_restart_trans(handle, inode, | ||
4176 | blocks_for_truncate(inode)); | ||
4177 | if (unlikely(err)) | ||
4178 | goto out_err; | ||
4179 | if (bh) { | ||
4180 | BUFFER_TRACE(bh, "retaking write access"); | ||
4181 | err = ext4_journal_get_write_access(handle, bh); | ||
4182 | if (unlikely(err)) | ||
4183 | goto out_err; | ||
4184 | } | ||
4185 | } | ||
4186 | |||
4187 | for (p = first; p < last; p++) | ||
4188 | *p = 0; | ||
4189 | |||
4190 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); | ||
4191 | return 0; | ||
4192 | out_err: | ||
4193 | ext4_std_error(inode->i_sb, err); | ||
4194 | return err; | ||
4195 | } | ||
4196 | |||
4197 | /** | ||
4198 | * ext4_free_data - free a list of data blocks | ||
4199 | * @handle: handle for this transaction | ||
4200 | * @inode: inode we are dealing with | ||
4201 | * @this_bh: indirect buffer_head which contains *@first and *@last | ||
4202 | * @first: array of block numbers | ||
4203 | * @last: points immediately past the end of array | ||
4204 | * | ||
4205 | * We are freeing all blocks referred from that array (numbers are stored as | ||
4206 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | ||
4207 | * | ||
4208 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | ||
4209 | * blocks are contiguous then releasing them at one time will only affect one | ||
4210 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | ||
4211 | * actually use a lot of journal space. | ||
4212 | * | ||
4213 | * @this_bh will be %NULL if @first and @last point into the inode's direct | ||
4214 | * block pointers. | ||
4215 | */ | ||
4216 | static void ext4_free_data(handle_t *handle, struct inode *inode, | ||
4217 | struct buffer_head *this_bh, | ||
4218 | __le32 *first, __le32 *last) | ||
4219 | { | ||
4220 | ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ | ||
4221 | unsigned long count = 0; /* Number of blocks in the run */ | ||
4222 | __le32 *block_to_free_p = NULL; /* Pointer into inode/ind | ||
4223 | corresponding to | ||
4224 | block_to_free */ | ||
4225 | ext4_fsblk_t nr; /* Current block # */ | ||
4226 | __le32 *p; /* Pointer into inode/ind | ||
4227 | for current block */ | ||
4228 | int err = 0; | ||
4229 | |||
4230 | if (this_bh) { /* For indirect block */ | ||
4231 | BUFFER_TRACE(this_bh, "get_write_access"); | ||
4232 | err = ext4_journal_get_write_access(handle, this_bh); | ||
4233 | /* Important: if we can't update the indirect pointers | ||
4234 | * to the blocks, we can't free them. */ | ||
4235 | if (err) | ||
4236 | return; | ||
4237 | } | ||
4238 | |||
4239 | for (p = first; p < last; p++) { | ||
4240 | nr = le32_to_cpu(*p); | ||
4241 | if (nr) { | ||
4242 | /* accumulate blocks to free if they're contiguous */ | ||
4243 | if (count == 0) { | ||
4244 | block_to_free = nr; | ||
4245 | block_to_free_p = p; | ||
4246 | count = 1; | ||
4247 | } else if (nr == block_to_free + count) { | ||
4248 | count++; | ||
4249 | } else { | ||
4250 | err = ext4_clear_blocks(handle, inode, this_bh, | ||
4251 | block_to_free, count, | ||
4252 | block_to_free_p, p); | ||
4253 | if (err) | ||
4254 | break; | ||
4255 | block_to_free = nr; | ||
4256 | block_to_free_p = p; | ||
4257 | count = 1; | ||
4258 | } | ||
4259 | } | ||
4260 | } | ||
4261 | |||
4262 | if (!err && count > 0) | ||
4263 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, | ||
4264 | count, block_to_free_p, p); | ||
4265 | if (err < 0) | ||
4266 | /* fatal error */ | ||
4267 | return; | ||
4268 | |||
4269 | if (this_bh) { | ||
4270 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | ||
4271 | |||
4272 | /* | ||
4273 | * The buffer head should have an attached journal head at this | ||
4274 | * point. However, if the data is corrupted and an indirect | ||
4275 | * block pointed to itself, it would have been detached when | ||
4276 | * the block was cleared. Check for this instead of OOPSing. | ||
4277 | */ | ||
4278 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | ||
4279 | ext4_handle_dirty_metadata(handle, inode, this_bh); | ||
4280 | else | ||
4281 | EXT4_ERROR_INODE(inode, | ||
4282 | "circular indirect block detected at " | ||
4283 | "block %llu", | ||
4284 | (unsigned long long) this_bh->b_blocknr); | ||
4285 | } | ||
4286 | } | ||
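
The batching pattern above, coalescing adjacent block numbers into (start, count) runs and releasing each run exactly once, is worth isolating. A minimal sketch with an invented free_run() callback standing in for ext4_clear_blocks():

#include <stdio.h>

static void free_run(unsigned long start, unsigned long count)
{
    printf("free %lu block(s) starting at %lu\n", count, start);
}

/* Walk an array of block numbers (0 = hole) and batch contiguous
 * runs, as ext4_free_data() does over the little-endian pointers. */
static void free_data(const unsigned long *p, const unsigned long *last)
{
    unsigned long block_to_free = 0, count = 0;

    for (; p < last; p++) {
        if (!*p)
            continue;                 /* a hole */
        if (count && *p == block_to_free + count) {
            count++;                  /* extend the current run */
        } else {
            if (count)
                free_run(block_to_free, count);
            block_to_free = *p;       /* start a new run */
            count = 1;
        }
    }
    if (count)
        free_run(block_to_free, count);
}

int main(void)
{
    unsigned long blocks[] = { 10, 11, 12, 0, 40, 41, 7 };

    free_data(blocks, blocks + 7);    /* runs: 10+3, 40+2, 7+1 */
    return 0;
}
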
4287 | |||
4288 | /** | ||
4289 | * ext4_free_branches - free an array of branches | ||
4290 | * @handle: JBD handle for this transaction | ||
4291 | * @inode: inode we are dealing with | ||
4292 | * @parent_bh: the buffer_head which contains *@first and *@last | ||
4293 | * @first: array of block numbers | ||
4294 | * @last: pointer immediately past the end of array | ||
4295 | * @depth: depth of the branches to free | ||
4296 | * | ||
4297 | * We are freeing all blocks referred from these branches (numbers are | ||
4298 | * stored as little-endian 32-bit) and updating @inode->i_blocks | ||
4299 | * appropriately. | ||
4300 | */ | ||
4301 | static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
4302 | struct buffer_head *parent_bh, | ||
4303 | __le32 *first, __le32 *last, int depth) | ||
4304 | { | ||
4305 | ext4_fsblk_t nr; | ||
4306 | __le32 *p; | ||
4307 | |||
4308 | if (ext4_handle_is_aborted(handle)) | ||
4309 | return; | ||
4310 | |||
4311 | if (depth--) { | ||
4312 | struct buffer_head *bh; | ||
4313 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4314 | p = last; | ||
4315 | while (--p >= first) { | ||
4316 | nr = le32_to_cpu(*p); | ||
4317 | if (!nr) | ||
4318 | continue; /* A hole */ | ||
4319 | |||
4320 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
4321 | nr, 1)) { | ||
4322 | EXT4_ERROR_INODE(inode, | ||
4323 | "invalid indirect mapped " | ||
4324 | "block %lu (level %d)", | ||
4325 | (unsigned long) nr, depth); | ||
4326 | break; | ||
4327 | } | ||
4328 | |||
4329 | /* Go read the buffer for the next level down */ | ||
4330 | bh = sb_bread(inode->i_sb, nr); | ||
4331 | |||
4332 | /* | ||
4333 | * A read failure? Report error and clear slot | ||
4334 | * (should be rare). | ||
4335 | */ | ||
4336 | if (!bh) { | ||
4337 | EXT4_ERROR_INODE_BLOCK(inode, nr, | ||
4338 | "Read failure"); | ||
4339 | continue; | ||
4340 | } | ||
4341 | |||
4342 | /* This zaps the entire block. Bottom up. */ | ||
4343 | BUFFER_TRACE(bh, "free child branches"); | ||
4344 | ext4_free_branches(handle, inode, bh, | ||
4345 | (__le32 *) bh->b_data, | ||
4346 | (__le32 *) bh->b_data + addr_per_block, | ||
4347 | depth); | ||
4348 | brelse(bh); | ||
4349 | |||
4350 | /* | ||
4351 | * Everything below this pointer has been | ||
4352 | * released. Now let this top-of-subtree go. | ||
4353 | * | ||
4354 | * We want the freeing of this indirect block to be | ||
4355 | * atomic in the journal with the updating of the | ||
4356 | * bitmap block which owns it. So make some room in | ||
4357 | * the journal. | ||
4358 | * | ||
4359 | * We zero the parent pointer *after* freeing its | ||
4360 | * pointee in the bitmaps, so if extend_transaction() | ||
4361 | * for some reason fails to put the bitmap changes and | ||
4362 | * the release into the same transaction, recovery | ||
4363 | * will merely complain about releasing a free block, | ||
4364 | * rather than leaking blocks. | ||
4365 | */ | ||
4366 | if (ext4_handle_is_aborted(handle)) | ||
4367 | return; | ||
4368 | if (try_to_extend_transaction(handle, inode)) { | ||
4369 | ext4_mark_inode_dirty(handle, inode); | ||
4370 | ext4_truncate_restart_trans(handle, inode, | ||
4371 | blocks_for_truncate(inode)); | ||
4372 | } | ||
4373 | |||
4374 | /* | ||
4375 | * The forget flag here is critical because if | ||
4376 | * we are journaling (and not doing data | ||
4377 | * journaling), we have to make sure a revoke | ||
4378 | * record is written to prevent the journal | ||
4379 | * replay from overwriting the (former) | ||
4380 | * indirect block if it gets reallocated as a | ||
4381 | * data block. This must happen in the same | ||
4382 | * transaction where the data blocks are | ||
4383 | * actually freed. | ||
4384 | */ | ||
4385 | ext4_free_blocks(handle, inode, NULL, nr, 1, | ||
4386 | EXT4_FREE_BLOCKS_METADATA| | ||
4387 | EXT4_FREE_BLOCKS_FORGET); | ||
4388 | |||
4389 | if (parent_bh) { | ||
4390 | /* | ||
4391 | * The block which we have just freed is | ||
4392 | * pointed to by an indirect block: journal it | ||
4393 | */ | ||
4394 | BUFFER_TRACE(parent_bh, "get_write_access"); | ||
4395 | if (!ext4_journal_get_write_access(handle, | ||
4396 | parent_bh)){ | ||
4397 | *p = 0; | ||
4398 | BUFFER_TRACE(parent_bh, | ||
4399 | "call ext4_handle_dirty_metadata"); | ||
4400 | ext4_handle_dirty_metadata(handle, | ||
4401 | inode, | ||
4402 | parent_bh); | ||
4403 | } | ||
4404 | } | ||
4405 | } | ||
4406 | } else { | ||
4407 | /* We have reached the bottom of the tree. */ | ||
4408 | BUFFER_TRACE(parent_bh, "free data blocks"); | ||
4409 | ext4_free_data(handle, inode, parent_bh, first, last); | ||
4410 | } | ||
4411 | } | ||
4412 | |||
4413 | int ext4_can_truncate(struct inode *inode) | 3066 | int ext4_can_truncate(struct inode *inode) |
4414 | { | 3067 | { |
4415 | if (S_ISREG(inode->i_mode)) | 3068 | if (S_ISREG(inode->i_mode)) |
@@ -4476,19 +3129,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4476 | */ | 3129 | */ |
4477 | void ext4_truncate(struct inode *inode) | 3130 | void ext4_truncate(struct inode *inode) |
4478 | { | 3131 | { |
4479 | handle_t *handle; | ||
4480 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
4481 | __le32 *i_data = ei->i_data; | ||
4482 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4483 | struct address_space *mapping = inode->i_mapping; | ||
4484 | ext4_lblk_t offsets[4]; | ||
4485 | Indirect chain[4]; | ||
4486 | Indirect *partial; | ||
4487 | __le32 nr = 0; | ||
4488 | int n = 0; | ||
4489 | ext4_lblk_t last_block, max_block; | ||
4490 | unsigned blocksize = inode->i_sb->s_blocksize; | ||
4491 | |||
4492 | trace_ext4_truncate_enter(inode); | 3132 | trace_ext4_truncate_enter(inode); |
4493 | 3133 | ||
4494 | if (!ext4_can_truncate(inode)) | 3134 | if (!ext4_can_truncate(inode)) |
@@ -4499,149 +3139,11 @@ void ext4_truncate(struct inode *inode) | |||
4499 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | 3139 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
4500 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); | 3140 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); |
4501 | 3141 | ||
4502 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 3142 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
4503 | ext4_ext_truncate(inode); | 3143 | ext4_ext_truncate(inode); |
4504 | trace_ext4_truncate_exit(inode); | 3144 | else |
4505 | return; | 3145 | ext4_ind_truncate(inode); |
4506 | } | ||
4507 | |||
4508 | handle = start_transaction(inode); | ||
4509 | if (IS_ERR(handle)) | ||
4510 | return; /* AKPM: return what? */ | ||
4511 | |||
4512 | last_block = (inode->i_size + blocksize-1) | ||
4513 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4514 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
4515 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4516 | |||
4517 | if (inode->i_size & (blocksize - 1)) | ||
4518 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | ||
4519 | goto out_stop; | ||
4520 | |||
4521 | if (last_block != max_block) { | ||
4522 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | ||
4523 | if (n == 0) | ||
4524 | goto out_stop; /* error */ | ||
4525 | } | ||
4526 | |||
4527 | /* | ||
4528 | * OK. This truncate is going to happen. We add the inode to the | ||
4529 | * orphan list, so that if this truncate spans multiple transactions, | ||
4530 | * and we crash, we will resume the truncate when the filesystem | ||
4531 | * recovers. It also marks the inode dirty, to catch the new size. | ||
4532 | * | ||
4533 | * Implication: the file must always be in a sane, consistent | ||
4534 | * truncatable state while each transaction commits. | ||
4535 | */ | ||
4536 | if (ext4_orphan_add(handle, inode)) | ||
4537 | goto out_stop; | ||
4538 | |||
4539 | /* | ||
4540 | * From here we block out all ext4_get_block() callers who want to | ||
4541 | * modify the block allocation tree. | ||
4542 | */ | ||
4543 | down_write(&ei->i_data_sem); | ||
4544 | |||
4545 | ext4_discard_preallocations(inode); | ||
4546 | |||
4547 | /* | ||
4548 | * The orphan list entry will now protect us from any crash which | ||
4549 | * occurs before the truncate completes, so it is now safe to propagate | ||
4550 | * the new, shorter inode size (held for now in i_size) into the | ||
4551 | * on-disk inode. We do this via i_disksize, which is the value which | ||
4552 | * ext4 *really* writes onto the disk inode. | ||
4553 | */ | ||
4554 | ei->i_disksize = inode->i_size; | ||
4555 | |||
4556 | if (last_block == max_block) { | ||
4557 | /* | ||
4558 | * It is unnecessary to free any data blocks if last_block is | ||
4559 | * equal to the indirect block limit. | ||
4560 | */ | ||
4561 | goto out_unlock; | ||
4562 | } else if (n == 1) { /* direct blocks */ | ||
4563 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | ||
4564 | i_data + EXT4_NDIR_BLOCKS); | ||
4565 | goto do_indirects; | ||
4566 | } | ||
4567 | |||
4568 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | ||
4569 | /* Kill the top of shared branch (not detached) */ | ||
4570 | if (nr) { | ||
4571 | if (partial == chain) { | ||
4572 | /* Shared branch grows from the inode */ | ||
4573 | ext4_free_branches(handle, inode, NULL, | ||
4574 | &nr, &nr+1, (chain+n-1) - partial); | ||
4575 | *partial->p = 0; | ||
4576 | /* | ||
4577 | * We mark the inode dirty prior to restart, | ||
4578 | * and prior to stop. No need for it here. | ||
4579 | */ | ||
4580 | } else { | ||
4581 | /* Shared branch grows from an indirect block */ | ||
4582 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
4583 | ext4_free_branches(handle, inode, partial->bh, | ||
4584 | partial->p, | ||
4585 | partial->p+1, (chain+n-1) - partial); | ||
4586 | } | ||
4587 | } | ||
4588 | /* Clear the ends of indirect blocks on the shared branch */ | ||
4589 | while (partial > chain) { | ||
4590 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | ||
4591 | (__le32*)partial->bh->b_data+addr_per_block, | ||
4592 | (chain+n-1) - partial); | ||
4593 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
4594 | brelse(partial->bh); | ||
4595 | partial--; | ||
4596 | } | ||
4597 | do_indirects: | ||
4598 | /* Kill the remaining (whole) subtrees */ | ||
4599 | switch (offsets[0]) { | ||
4600 | default: | ||
4601 | nr = i_data[EXT4_IND_BLOCK]; | ||
4602 | if (nr) { | ||
4603 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | ||
4604 | i_data[EXT4_IND_BLOCK] = 0; | ||
4605 | } | ||
4606 | case EXT4_IND_BLOCK: | ||
4607 | nr = i_data[EXT4_DIND_BLOCK]; | ||
4608 | if (nr) { | ||
4609 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | ||
4610 | i_data[EXT4_DIND_BLOCK] = 0; | ||
4611 | } | ||
4612 | case EXT4_DIND_BLOCK: | ||
4613 | nr = i_data[EXT4_TIND_BLOCK]; | ||
4614 | if (nr) { | ||
4615 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | ||
4616 | i_data[EXT4_TIND_BLOCK] = 0; | ||
4617 | } | ||
4618 | case EXT4_TIND_BLOCK: | ||
4619 | ; | ||
4620 | } | ||
4621 | |||
4622 | out_unlock: | ||
4623 | up_write(&ei->i_data_sem); | ||
4624 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4625 | ext4_mark_inode_dirty(handle, inode); | ||
4626 | |||
4627 | /* | ||
4628 | * In a multi-transaction truncate, we only make the final transaction | ||
4629 | * synchronous | ||
4630 | */ | ||
4631 | if (IS_SYNC(inode)) | ||
4632 | ext4_handle_sync(handle); | ||
4633 | out_stop: | ||
4634 | /* | ||
4635 | * If this was a simple ftruncate(), and the file will remain alive | ||
4636 | * then we need to clear up the orphan record which we created above. | ||
4637 | * However, if this was a real unlink then we were called by | ||
4638 | * ext4_delete_inode(), and we allow that function to clean up the | ||
4639 | * orphan info for us. | ||
4640 | */ | ||
4641 | if (inode->i_nlink) | ||
4642 | ext4_orphan_del(handle, inode); | ||
4643 | 3146 | ||
4644 | ext4_journal_stop(handle); | ||
4645 | trace_ext4_truncate_exit(inode); | 3147 | trace_ext4_truncate_exit(inode); |
4646 | } | 3148 | } |
4647 | 3149 | ||
@@ -5012,7 +3514,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
5012 | (S_ISLNK(inode->i_mode) && | 3514 | (S_ISLNK(inode->i_mode) && |
5013 | !ext4_inode_is_fast_symlink(inode))) { | 3515 | !ext4_inode_is_fast_symlink(inode))) { |
5014 | /* Validate block references which are part of inode */ | 3516 | /* Validate block references which are part of inode */ |
5015 | ret = ext4_check_inode_blockref(inode); | 3517 | ret = ext4_ind_check_inode(inode); |
5016 | } | 3518 | } |
5017 | if (ret) | 3519 | if (ret) |
5018 | goto bad_inode; | 3520 | goto bad_inode; |
@@ -5459,34 +3961,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
5459 | return 0; | 3961 | return 0; |
5460 | } | 3962 | } |
5461 | 3963 | ||
5462 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | ||
5463 | int chunk) | ||
5464 | { | ||
5465 | int indirects; | ||
5466 | |||
5467 | /* if nrblocks are contiguous */ | ||
5468 | if (chunk) { | ||
5469 | /* | ||
5470 | * With N contiguous data blocks, we need at most | ||
5471 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
5472 | * 2 dindirect blocks, and 1 tindirect block | ||
5473 | */ | ||
5474 | return DIV_ROUND_UP(nrblocks, | ||
5475 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
5476 | } | ||
5477 | /* | ||
5478 | * if nrblocks are not contiguous, worst case, each block touches | ||
5479 | * an indirect block, and each indirect block touches a double indirect | ||
5480 | * block, plus a triple indirect block | ||
5481 | */ | ||
5482 | indirects = nrblocks * 2 + 1; | ||
5483 | return indirects; | ||
5484 | } | ||
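
The estimate removed here (it relocates to indirect.c as ext4_ind_trans_blocks()) is simple arithmetic. A standalone restatement under an assumed 4K block size, i.e. 1024 pointers per indirect block:

#include <stdio.h>

#define ADDR_PER_BLOCK 1024   /* assumed 4K blocks */

static int indirect_trans_blocks(int nrblocks, int chunk)
{
    if (chunk)
        /* contiguous: at most N/ADDR_PER_BLOCK + 1 indirect blocks,
         * plus 2 dindirect and 1 tindirect in the worst case */
        return (nrblocks + ADDR_PER_BLOCK - 1) / ADDR_PER_BLOCK + 4;
    /* scattered: each block may touch its own indirect and
     * dindirect block, plus one shared tindirect */
    return nrblocks * 2 + 1;
}

int main(void)
{
    printf("64 contiguous blocks -> %d metadata blocks\n",
           indirect_trans_blocks(64, 1));   /* 5 */
    printf("64 scattered blocks  -> %d metadata blocks\n",
           indirect_trans_blocks(64, 0));   /* 129 */
    return 0;
}
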
5485 | |||
5486 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 3964 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5487 | { | 3965 | { |
5488 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 3966 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
5489 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); | 3967 | return ext4_ind_trans_blocks(inode, nrblocks, chunk); |
5490 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 3968 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); |
5491 | } | 3969 | } |
5492 | 3970 | ||
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554e773f..f18bfe37aff8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -202,8 +202,9 @@ setversion_out: | |||
202 | struct super_block *sb = inode->i_sb; | 202 | struct super_block *sb = inode->i_sb; |
203 | int err, err2=0; | 203 | int err, err2=0; |
204 | 204 | ||
205 | if (!capable(CAP_SYS_RESOURCE)) | 205 | err = ext4_resize_begin(sb); |
206 | return -EPERM; | 206 | if (err) |
207 | return err; | ||
207 | 208 | ||
208 | if (get_user(n_blocks_count, (__u32 __user *)arg)) | 209 | if (get_user(n_blocks_count, (__u32 __user *)arg)) |
209 | return -EFAULT; | 210 | return -EFAULT; |
@@ -221,6 +222,7 @@ setversion_out: | |||
221 | if (err == 0) | 222 | if (err == 0) |
222 | err = err2; | 223 | err = err2; |
223 | mnt_drop_write(filp->f_path.mnt); | 224 | mnt_drop_write(filp->f_path.mnt); |
225 | ext4_resize_end(sb); | ||
224 | 226 | ||
225 | return err; | 227 | return err; |
226 | } | 228 | } |
@@ -271,8 +273,9 @@ mext_out: | |||
271 | struct super_block *sb = inode->i_sb; | 273 | struct super_block *sb = inode->i_sb; |
272 | int err, err2=0; | 274 | int err, err2=0; |
273 | 275 | ||
274 | if (!capable(CAP_SYS_RESOURCE)) | 276 | err = ext4_resize_begin(sb); |
275 | return -EPERM; | 277 | if (err) |
278 | return err; | ||
276 | 279 | ||
277 | if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, | 280 | if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, |
278 | sizeof(input))) | 281 | sizeof(input))) |
@@ -291,6 +294,7 @@ mext_out: | |||
291 | if (err == 0) | 294 | if (err == 0) |
292 | err = err2; | 295 | err = err2; |
293 | mnt_drop_write(filp->f_path.mnt); | 296 | mnt_drop_write(filp->f_path.mnt); |
297 | ext4_resize_end(sb); | ||
294 | 298 | ||
295 | return err; | 299 | return err; |
296 | } | 300 | } |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d56850..17a5a57c415a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -75,8 +75,8 @@ | |||
75 | * | 75 | * |
76 | * The inode preallocation space is used looking at the _logical_ start | 76 | * The inode preallocation space is used looking at the _logical_ start |
77 | * block. If only the logical file block falls within the range of prealloc | 77 | * block. If only the logical file block falls within the range of prealloc |
78 | * space we will consume the particular prealloc space. This make sure that | 78 | * space we will consume the particular prealloc space. This makes sure that |
79 | * that the we have contiguous physical blocks representing the file blocks | 79 | * we have contiguous physical blocks representing the file blocks |
80 | * | 80 | * |
81 | * The important thing to be noted in case of inode prealloc space is that | 81 | * The important thing to be noted in case of inode prealloc space is that |
82 | * we don't modify the values associated to inode prealloc space except | 82 | * we don't modify the values associated to inode prealloc space except |
@@ -84,7 +84,7 @@ | |||
84 | * | 84 | * |
85 | * If we are not able to find blocks in the inode prealloc space and if we | 85 | * If we are not able to find blocks in the inode prealloc space and if we |
86 | * have the group allocation flag set then we look at the locality group | 86 | * have the group allocation flag set then we look at the locality group |
87 | * prealloc space. These are per CPU prealloc list repreasented as | 87 | * prealloc space. These are per CPU prealloc list represented as |
88 | * | 88 | * |
89 | * ext4_sb_info.s_locality_groups[smp_processor_id()] | 89 | * ext4_sb_info.s_locality_groups[smp_processor_id()] |
90 | * | 90 | * |
@@ -128,12 +128,13 @@ | |||
128 | * we are doing a group prealloc we try to normalize the request to | 128 | * we are doing a group prealloc we try to normalize the request to |
129 | * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is | 129 | * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is |
130 | * 512 blocks. This can be tuned via | 130 | * 512 blocks. This can be tuned via |
131 | * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in | 131 | * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in |
132 | * terms of number of blocks. If we have mounted the file system with -O | 132 | * terms of number of blocks. If we have mounted the file system with -O |
133 | * stripe=<value> option the group prealloc request is normalized to the | 133 | * stripe=<value> option the group prealloc request is normalized to the |
134 | * stripe value (sbi->s_stripe) | 134 | * the smallest multiple of the stripe value (sbi->s_stripe) which is |
135 | * greater than the default mb_group_prealloc. | ||
135 | * | 136 | * |
136 | * The regular allocator(using the buddy cache) supports few tunables. | 137 | * The regular allocator (using the buddy cache) supports a few tunables. |
137 | * | 138 | * |
138 | * /sys/fs/ext4/<partition>/mb_min_to_scan | 139 | * /sys/fs/ext4/<partition>/mb_min_to_scan |
139 | * /sys/fs/ext4/<partition>/mb_max_to_scan | 140 | * /sys/fs/ext4/<partition>/mb_max_to_scan |
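The stripe normalization mentioned in the updated comment amounts to rounding the group preallocation default up to a stripe multiple. A hypothetical sketch; the 512-block default and the stripe of 96 are illustrative values, and round-up-to-at-least is assumed rather than strictly greater:

#include <stdio.h>

/* Smallest multiple of @stripe covering @prealloc, the normalization
 * the comment above describes for -O stripe=<value> mounts. */
static unsigned int stripe_prealloc(unsigned int prealloc,
                                    unsigned int stripe)
{
    return ((prealloc + stripe - 1) / stripe) * stripe;
}

int main(void)
{
    printf("normalized prealloc: %u blocks\n",
           stripe_prealloc(512, 96));   /* 576 = 6 * 96 */
    return 0;
}
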
@@ -152,7 +153,7 @@ | |||
152 | * best extent in the found extents. Searching for the blocks starts with | 153 | * best extent in the found extents. Searching for the blocks starts with |
153 | * the group specified as the goal value in allocation context via | 154 | * the group specified as the goal value in allocation context via |
154 | * ac_g_ex. Each group is first checked based on the criteria whether it | 155 | * ac_g_ex. Each group is first checked based on the criteria whether it |
155 | * can used for allocation. ext4_mb_good_group explains how the groups are | 156 | * can be used for allocation. ext4_mb_good_group explains how the groups are |
156 | * checked. | 157 | * checked. |
157 | * | 158 | * |
158 | * Both the prealloc spaces are populated as above. So for the first | 159 | * Both the prealloc spaces are populated as above. So for the first |
@@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | |||
492 | b2 = (unsigned char *) bitmap; | 493 | b2 = (unsigned char *) bitmap; |
493 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { | 494 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { |
494 | if (b1[i] != b2[i]) { | 495 | if (b1[i] != b2[i]) { |
495 | printk(KERN_ERR "corruption in group %u " | 496 | ext4_msg(e4b->bd_sb, KERN_ERR, |
496 | "at byte %u(%u): %x in copy != %x " | 497 | "corruption in group %u " |
497 | "on disk/prealloc\n", | 498 | "at byte %u(%u): %x in copy != %x " |
498 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | 499 | "on disk/prealloc", |
500 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | ||
499 | BUG(); | 501 | BUG(); |
500 | } | 502 | } |
501 | } | 503 | } |
@@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1125 | grp = ext4_get_group_info(sb, group); | 1127 | grp = ext4_get_group_info(sb, group); |
1126 | 1128 | ||
1127 | e4b->bd_blkbits = sb->s_blocksize_bits; | 1129 | e4b->bd_blkbits = sb->s_blocksize_bits; |
1128 | e4b->bd_info = ext4_get_group_info(sb, group); | 1130 | e4b->bd_info = grp; |
1129 | e4b->bd_sb = sb; | 1131 | e4b->bd_sb = sb; |
1130 | e4b->bd_group = group; | 1132 | e4b->bd_group = group; |
1131 | e4b->bd_buddy_page = NULL; | 1133 | e4b->bd_buddy_page = NULL; |
@@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len) | |||
1281 | } | 1283 | } |
1282 | } | 1284 | } |
1283 | 1285 | ||
1284 | static void mb_set_bits(void *bm, int cur, int len) | 1286 | void ext4_set_bits(void *bm, int cur, int len) |
1285 | { | 1287 | { |
1286 | __u32 *addr; | 1288 | __u32 *addr; |
1287 | 1289 | ||
@@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
1510 | } | 1512 | } |
1511 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); | 1513 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); |
1512 | 1514 | ||
1513 | mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | 1515 | ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); |
1514 | mb_check_buddy(e4b); | 1516 | mb_check_buddy(e4b); |
1515 | 1517 | ||
1516 | return ret; | 1518 | return ret; |
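The helper renamed here (mb_set_bits() becoming the exported ext4_set_bits()) sets len bits starting at bit cur of a bitmap. A simplified model of the operation; the real ext4_set_bits() additionally fast-paths whole 32-bit words, so treat this as illustrative only:

    /* set len bits of bm starting at bit cur, one bit at a time */
    static void set_bits_sketch(unsigned char *bm, int cur, int len)
    {
            while (len-- > 0) {
                    bm[cur >> 3] |= 1U << (cur & 7);
                    cur++;
            }
    }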
@@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2223 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2225 | EXT4_DESC_PER_BLOCK_BITS(sb); |
2224 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | 2226 | meta_group_info = kmalloc(metalen, GFP_KERNEL); |
2225 | if (meta_group_info == NULL) { | 2227 | if (meta_group_info == NULL) { |
2226 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | 2228 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " |
2227 | "buddy group\n"); | 2229 | "for a buddy group"); |
2228 | goto exit_meta_group_info; | 2230 | goto exit_meta_group_info; |
2229 | } | 2231 | } |
2230 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = | 2232 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = |
@@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2237 | 2239 | ||
2238 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); | 2240 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); |
2239 | if (meta_group_info[i] == NULL) { | 2241 | if (meta_group_info[i] == NULL) { |
2240 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | 2242 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); |
2241 | goto exit_group_info; | 2243 | goto exit_group_info; |
2242 | } | 2244 | } |
2243 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); | 2245 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); |
@@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2279 | 2281 | ||
2280 | exit_group_info: | 2282 | exit_group_info: |
2281 | /* If a meta_group_info table has been allocated, release it now */ | 2283 | /* If a meta_group_info table has been allocated, release it now */ |
2282 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) | 2284 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { |
2283 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); | 2285 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); |
2286 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; | ||
2287 | } | ||
2284 | exit_meta_group_info: | 2288 | exit_meta_group_info: |
2285 | return -ENOMEM; | 2289 | return -ENOMEM; |
2286 | } /* ext4_mb_add_groupinfo */ | 2290 | } /* ext4_mb_add_groupinfo */ |
@@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
2328 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | 2332 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte |
2329 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | 2333 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. |
2330 | * So a two level scheme suffices for now. */ | 2334 | * So a two level scheme suffices for now. */ |
2331 | sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); | 2335 | sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); |
2332 | if (sbi->s_group_info == NULL) { | 2336 | if (sbi->s_group_info == NULL) { |
2333 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | 2337 | ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); |
2334 | return -ENOMEM; | 2338 | return -ENOMEM; |
2335 | } | 2339 | } |
2336 | sbi->s_buddy_cache = new_inode(sb); | 2340 | sbi->s_buddy_cache = new_inode(sb); |
2337 | if (sbi->s_buddy_cache == NULL) { | 2341 | if (sbi->s_buddy_cache == NULL) { |
2338 | printk(KERN_ERR "EXT4-fs: can't get new inode\n"); | 2342 | ext4_msg(sb, KERN_ERR, "can't get new inode"); |
2339 | goto err_freesgi; | 2343 | goto err_freesgi; |
2340 | } | 2344 | } |
2341 | sbi->s_buddy_cache->i_ino = get_next_ino(); | 2345 | /* To avoid potentially colliding with a valid on-disk inode number, |
2346 | * use EXT4_BAD_INO for the buddy cache inode number. This inode is | ||
2347 | * not in the inode hash, so it should never be found by iget(), but | ||
2348 | * this will avoid confusion if it ever shows up during debugging. */ | ||
2349 | sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; | ||
2342 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; | 2350 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; |
2343 | for (i = 0; i < ngroups; i++) { | 2351 | for (i = 0; i < ngroups; i++) { |
2344 | desc = ext4_get_group_desc(sb, i, NULL); | 2352 | desc = ext4_get_group_desc(sb, i, NULL); |
2345 | if (desc == NULL) { | 2353 | if (desc == NULL) { |
2346 | printk(KERN_ERR | 2354 | ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); |
2347 | "EXT4-fs: can't read descriptor %u\n", i); | ||
2348 | goto err_freebuddy; | 2355 | goto err_freebuddy; |
2349 | } | 2356 | } |
2350 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) | 2357 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) |
@@ -2362,7 +2369,7 @@ err_freebuddy: | |||
2362 | kfree(sbi->s_group_info[i]); | 2369 | kfree(sbi->s_group_info[i]); |
2363 | iput(sbi->s_buddy_cache); | 2370 | iput(sbi->s_buddy_cache); |
2364 | err_freesgi: | 2371 | err_freesgi: |
2365 | kfree(sbi->s_group_info); | 2372 | ext4_kvfree(sbi->s_group_info); |
2366 | return -ENOMEM; | 2373 | return -ENOMEM; |
2367 | } | 2374 | } |
2368 | 2375 | ||
@@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size) | |||
2404 | slab_size, 0, SLAB_RECLAIM_ACCOUNT, | 2411 | slab_size, 0, SLAB_RECLAIM_ACCOUNT, |
2405 | NULL); | 2412 | NULL); |
2406 | 2413 | ||
2414 | ext4_groupinfo_caches[cache_index] = cachep; | ||
2415 | |||
2407 | mutex_unlock(&ext4_grpinfo_slab_create_mutex); | 2416 | mutex_unlock(&ext4_grpinfo_slab_create_mutex); |
2408 | if (!cachep) { | 2417 | if (!cachep) { |
2409 | printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); | 2418 | printk(KERN_EMERG |
2419 | "EXT4-fs: no memory for groupinfo slab cache\n"); | ||
2410 | return -ENOMEM; | 2420 | return -ENOMEM; |
2411 | } | 2421 | } |
2412 | 2422 | ||
2413 | ext4_groupinfo_caches[cache_index] = cachep; | ||
2414 | |||
2415 | return 0; | 2423 | return 0; |
2416 | } | 2424 | } |
2417 | 2425 | ||
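Moving the assignment of ext4_groupinfo_caches[cache_index] in front of the mutex_unlock() publishes the new slab cache while ext4_grpinfo_slab_create_mutex is still held, so two racing mounts can no longer each create a cache for the same blocksize index. A user-space model of the create-and-publish pattern (the names here are stand-ins, not ext4 code):

    #include <pthread.h>
    #include <stddef.h>

    static pthread_mutex_t create_mutex = PTHREAD_MUTEX_INITIALIZER;
    static void *cache_table[32];

    void *get_or_create(int index, void *(*create)(void))
    {
            void *obj;

            pthread_mutex_lock(&create_mutex);
            obj = cache_table[index];
            if (obj == NULL) {
                    obj = create();
                    cache_table[index] = obj;   /* publish under the lock */
            }
            pthread_mutex_unlock(&create_mutex);
            return obj;
    }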
@@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2457 | i++; | 2465 | i++; |
2458 | } while (i <= sb->s_blocksize_bits + 1); | 2466 | } while (i <= sb->s_blocksize_bits + 1); |
2459 | 2467 | ||
2460 | /* init file for buddy data */ | ||
2461 | ret = ext4_mb_init_backend(sb); | ||
2462 | if (ret != 0) { | ||
2463 | goto out; | ||
2464 | } | ||
2465 | |||
2466 | spin_lock_init(&sbi->s_md_lock); | 2468 | spin_lock_init(&sbi->s_md_lock); |
2467 | spin_lock_init(&sbi->s_bal_lock); | 2469 | spin_lock_init(&sbi->s_bal_lock); |
2468 | 2470 | ||
@@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2472 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; | 2474 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; |
2473 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; | 2475 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; |
2474 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; | 2476 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; |
2477 | /* | ||
2478 | * If there is a s_stripe > 1, then we set the s_mb_group_prealloc | ||
2479 | * to the lowest multiple of s_stripe which is bigger than | ||
2480 | * the s_mb_group_prealloc as determined above. We want | ||
2481 | * the preallocation size to be an exact multiple of the | ||
2482 | * RAID stripe size so that preallocations don't fragment | ||
2483 | * the stripes. | ||
2484 | */ | ||
2485 | if (sbi->s_stripe > 1) { | ||
2486 | sbi->s_mb_group_prealloc = roundup( | ||
2487 | sbi->s_mb_group_prealloc, sbi->s_stripe); | ||
2488 | } | ||
2475 | 2489 | ||
2476 | sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); | 2490 | sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); |
2477 | if (sbi->s_locality_groups == NULL) { | 2491 | if (sbi->s_locality_groups == NULL) { |
@@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2487 | spin_lock_init(&lg->lg_prealloc_lock); | 2501 | spin_lock_init(&lg->lg_prealloc_lock); |
2488 | } | 2502 | } |
2489 | 2503 | ||
2504 | /* init file for buddy data */ | ||
2505 | ret = ext4_mb_init_backend(sb); | ||
2506 | if (ret != 0) { | ||
2507 | goto out; | ||
2508 | } | ||
2509 | |||
2490 | if (sbi->s_proc) | 2510 | if (sbi->s_proc) |
2491 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, | 2511 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, |
2492 | &ext4_mb_seq_groups_fops, sb); | 2512 | &ext4_mb_seq_groups_fops, sb); |
@@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb) | |||
2544 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2564 | EXT4_DESC_PER_BLOCK_BITS(sb); |
2545 | for (i = 0; i < num_meta_group_infos; i++) | 2565 | for (i = 0; i < num_meta_group_infos; i++) |
2546 | kfree(sbi->s_group_info[i]); | 2566 | kfree(sbi->s_group_info[i]); |
2547 | kfree(sbi->s_group_info); | 2567 | ext4_kvfree(sbi->s_group_info); |
2548 | } | 2568 | } |
2549 | kfree(sbi->s_mb_offsets); | 2569 | kfree(sbi->s_mb_offsets); |
2550 | kfree(sbi->s_mb_maxs); | 2570 | kfree(sbi->s_mb_maxs); |
2551 | if (sbi->s_buddy_cache) | 2571 | if (sbi->s_buddy_cache) |
2552 | iput(sbi->s_buddy_cache); | 2572 | iput(sbi->s_buddy_cache); |
2553 | if (sbi->s_mb_stats) { | 2573 | if (sbi->s_mb_stats) { |
2554 | printk(KERN_INFO | 2574 | ext4_msg(sb, KERN_INFO, |
2555 | "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", | 2575 | "mballoc: %u blocks %u reqs (%u success)", |
2556 | atomic_read(&sbi->s_bal_allocated), | 2576 | atomic_read(&sbi->s_bal_allocated), |
2557 | atomic_read(&sbi->s_bal_reqs), | 2577 | atomic_read(&sbi->s_bal_reqs), |
2558 | atomic_read(&sbi->s_bal_success)); | 2578 | atomic_read(&sbi->s_bal_success)); |
2559 | printk(KERN_INFO | 2579 | ext4_msg(sb, KERN_INFO, |
2560 | "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " | 2580 | "mballoc: %u extents scanned, %u goal hits, " |
2561 | "%u 2^N hits, %u breaks, %u lost\n", | 2581 | "%u 2^N hits, %u breaks, %u lost", |
2562 | atomic_read(&sbi->s_bal_ex_scanned), | 2582 | atomic_read(&sbi->s_bal_ex_scanned), |
2563 | atomic_read(&sbi->s_bal_goals), | 2583 | atomic_read(&sbi->s_bal_goals), |
2564 | atomic_read(&sbi->s_bal_2orders), | 2584 | atomic_read(&sbi->s_bal_2orders), |
2565 | atomic_read(&sbi->s_bal_breaks), | 2585 | atomic_read(&sbi->s_bal_breaks), |
2566 | atomic_read(&sbi->s_mb_lost_chunks)); | 2586 | atomic_read(&sbi->s_mb_lost_chunks)); |
2567 | printk(KERN_INFO | 2587 | ext4_msg(sb, KERN_INFO, |
2568 | "EXT4-fs: mballoc: %lu generated and it took %Lu\n", | 2588 | "mballoc: %lu generated and it took %Lu", |
2569 | sbi->s_mb_buddies_generated++, | 2589 | sbi->s_mb_buddies_generated, |
2570 | sbi->s_mb_generation_time); | 2590 | sbi->s_mb_generation_time); |
2571 | printk(KERN_INFO | 2591 | ext4_msg(sb, KERN_INFO, |
2572 | "EXT4-fs: mballoc: %u preallocated, %u discarded\n", | 2592 | "mballoc: %u preallocated, %u discarded", |
2573 | atomic_read(&sbi->s_mb_preallocated), | 2593 | atomic_read(&sbi->s_mb_preallocated), |
2574 | atomic_read(&sbi->s_mb_discarded)); | 2594 | atomic_read(&sbi->s_mb_discarded)); |
2575 | } | 2595 | } |
@@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | |||
2628 | rb_erase(&entry->node, &(db->bb_free_root)); | 2648 | rb_erase(&entry->node, &(db->bb_free_root)); |
2629 | mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); | 2649 | mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); |
2630 | 2650 | ||
2651 | /* | ||
2652 | * Clear the trimmed flag for the group so that the next | ||
2653 | * ext4_trim_fs can trim it. | ||
2654 | * If the volume is mounted with -o discard, online discard | ||
2655 | * is supported and the free blocks will be trimmed online. | ||
2656 | */ | ||
2657 | if (!test_opt(sb, DISCARD)) | ||
2658 | EXT4_MB_GRP_CLEAR_TRIMMED(db); | ||
2659 | |||
2631 | if (!db->bb_free_root.rb_node) { | 2660 | if (!db->bb_free_root.rb_node) { |
2632 | /* No more items in the per group rb tree | 2661 | /* No more items in the per group rb tree |
2633 | * balance refcounts from ext4_mb_free_metadata() | 2662 | * balance refcounts from ext4_mb_free_metadata() |
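EXT4_MB_GRP_CLEAR_TRIMMED() here drops the per-group "was trimmed" flag whenever blocks are freed back to the group, so the next ext4_trim_fs() pass will revisit it; with -o discard the free space is trimmed online instead and the flag can stay set. A hedged sketch of what such flag helpers look like (the real macros live in ext4.h; the bit layout below is assumed for illustration):

    #define GRP_WAS_TRIMMED 1UL             /* assumed bit, not from ext4.h */

    struct grp_state_sketch {
            unsigned long bb_state;
    };

    static inline void grp_set_trimmed(struct grp_state_sketch *g)
    {
            g->bb_state |= GRP_WAS_TRIMMED;
    }

    static inline void grp_clear_trimmed(struct grp_state_sketch *g)
    {
            g->bb_state &= ~GRP_WAS_TRIMMED;
    }

    static inline int grp_was_trimmed(const struct grp_state_sketch *g)
    {
            return (g->bb_state & GRP_WAS_TRIMMED) != 0;
    }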
@@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2771 | * We leak some of the blocks here. | 2800 | * We leak some of the blocks here. |
2772 | */ | 2801 | */ |
2773 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | 2802 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); |
2774 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, | 2803 | ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
2775 | ac->ac_b_ex.fe_len); | 2804 | ac->ac_b_ex.fe_len); |
2776 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | 2805 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); |
2777 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); | 2806 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
2778 | if (!err) | 2807 | if (!err) |
@@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2790 | } | 2819 | } |
2791 | } | 2820 | } |
2792 | #endif | 2821 | #endif |
2793 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); | 2822 | ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
2823 | ac->ac_b_ex.fe_len); | ||
2794 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 2824 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
2795 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | 2825 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
2796 | ext4_free_blks_set(sb, gdp, | 2826 | ext4_free_blks_set(sb, gdp, |
@@ -2830,8 +2860,9 @@ out_err: | |||
2830 | 2860 | ||
2831 | /* | 2861 | /* |
2832 | * here we normalize request for locality group | 2862 | * here we normalize request for locality group |
2833 | * Group request are normalized to s_strip size if we set the same via mount | 2863 | * Group requests are normalized to s_mb_group_prealloc, which is |
2834 | * option. If not we set it to s_mb_group_prealloc which can be configured via | 2864 | * rounded up to a multiple of s_stripe if that was set via mount option. |
2865 | * s_mb_group_prealloc can be configured via | ||
2835 | * /sys/fs/ext4/<partition>/mb_group_prealloc | 2866 | * /sys/fs/ext4/<partition>/mb_group_prealloc |
2836 | * | 2867 | * |
2837 | * XXX: should we try to preallocate more than the group has now? | 2868 | * XXX: should we try to preallocate more than the group has now? |
@@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) | |||
2842 | struct ext4_locality_group *lg = ac->ac_lg; | 2873 | struct ext4_locality_group *lg = ac->ac_lg; |
2843 | 2874 | ||
2844 | BUG_ON(lg == NULL); | 2875 | BUG_ON(lg == NULL); |
2845 | if (EXT4_SB(sb)->s_stripe) | 2876 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; |
2846 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; | ||
2847 | else | ||
2848 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; | ||
2849 | mb_debug(1, "#%u: goal %u blocks for locality group\n", | 2877 | mb_debug(1, "#%u: goal %u blocks for locality group\n", |
2850 | current->pid, ac->ac_g_ex.fe_len); | 2878 | current->pid, ac->ac_g_ex.fe_len); |
2851 | } | 2879 | } |
@@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
3001 | 3029 | ||
3002 | if (start + size <= ac->ac_o_ex.fe_logical && | 3030 | if (start + size <= ac->ac_o_ex.fe_logical && |
3003 | start > ac->ac_o_ex.fe_logical) { | 3031 | start > ac->ac_o_ex.fe_logical) { |
3004 | printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", | 3032 | ext4_msg(ac->ac_sb, KERN_ERR, |
3005 | (unsigned long) start, (unsigned long) size, | 3033 | "start %lu, size %lu, fe_logical %lu", |
3006 | (unsigned long) ac->ac_o_ex.fe_logical); | 3034 | (unsigned long) start, (unsigned long) size, |
3035 | (unsigned long) ac->ac_o_ex.fe_logical); | ||
3007 | } | 3036 | } |
3008 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && | 3037 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && |
3009 | start > ac->ac_o_ex.fe_logical); | 3038 | start > ac->ac_o_ex.fe_logical); |
@@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | |||
3262 | 3291 | ||
3263 | while (n) { | 3292 | while (n) { |
3264 | entry = rb_entry(n, struct ext4_free_data, node); | 3293 | entry = rb_entry(n, struct ext4_free_data, node); |
3265 | mb_set_bits(bitmap, entry->start_blk, entry->count); | 3294 | ext4_set_bits(bitmap, entry->start_blk, entry->count); |
3266 | n = rb_next(n); | 3295 | n = rb_next(n); |
3267 | } | 3296 | } |
3268 | return; | 3297 | return; |
@@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |||
3304 | if (unlikely(len == 0)) | 3333 | if (unlikely(len == 0)) |
3305 | continue; | 3334 | continue; |
3306 | BUG_ON(groupnr != group); | 3335 | BUG_ON(groupnr != group); |
3307 | mb_set_bits(bitmap, start, len); | 3336 | ext4_set_bits(bitmap, start, len); |
3308 | preallocated += len; | 3337 | preallocated += len; |
3309 | count++; | 3338 | count++; |
3310 | } | 3339 | } |
@@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3584 | bit = next + 1; | 3613 | bit = next + 1; |
3585 | } | 3614 | } |
3586 | if (free != pa->pa_free) { | 3615 | if (free != pa->pa_free) { |
3587 | printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", | 3616 | ext4_msg(e4b->bd_sb, KERN_CRIT, |
3588 | pa, (unsigned long) pa->pa_lstart, | 3617 | "pa %p: logic %lu, phys. %lu, len %lu", |
3589 | (unsigned long) pa->pa_pstart, | 3618 | pa, (unsigned long) pa->pa_lstart, |
3590 | (unsigned long) pa->pa_len); | 3619 | (unsigned long) pa->pa_pstart, |
3620 | (unsigned long) pa->pa_len); | ||
3591 | ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", | 3621 | ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", |
3592 | free, pa->pa_free); | 3622 | free, pa->pa_free); |
3593 | /* | 3623 | /* |
@@ -3775,7 +3805,8 @@ repeat: | |||
3775 | * use preallocation while we're discarding it */ | 3805 | * use preallocation while we're discarding it */ |
3776 | spin_unlock(&pa->pa_lock); | 3806 | spin_unlock(&pa->pa_lock); |
3777 | spin_unlock(&ei->i_prealloc_lock); | 3807 | spin_unlock(&ei->i_prealloc_lock); |
3778 | printk(KERN_ERR "uh-oh! used pa while discarding\n"); | 3808 | ext4_msg(sb, KERN_ERR, |
3809 | "uh-oh! used pa while discarding"); | ||
3779 | WARN_ON(1); | 3810 | WARN_ON(1); |
3780 | schedule_timeout_uninterruptible(HZ); | 3811 | schedule_timeout_uninterruptible(HZ); |
3781 | goto repeat; | 3812 | goto repeat; |
@@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3852 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) | 3883 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) |
3853 | return; | 3884 | return; |
3854 | 3885 | ||
3855 | printk(KERN_ERR "EXT4-fs: Can't allocate:" | 3886 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" |
3856 | " Allocation context details:\n"); | 3887 | " Allocation context details:"); |
3857 | printk(KERN_ERR "EXT4-fs: status %d flags %d\n", | 3888 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", |
3858 | ac->ac_status, ac->ac_flags); | 3889 | ac->ac_status, ac->ac_flags); |
3859 | printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " | 3890 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " |
3860 | "best %lu/%lu/%lu@%lu cr %d\n", | 3891 | "goal %lu/%lu/%lu@%lu, " |
3892 | "best %lu/%lu/%lu@%lu cr %d", | ||
3861 | (unsigned long)ac->ac_o_ex.fe_group, | 3893 | (unsigned long)ac->ac_o_ex.fe_group, |
3862 | (unsigned long)ac->ac_o_ex.fe_start, | 3894 | (unsigned long)ac->ac_o_ex.fe_start, |
3863 | (unsigned long)ac->ac_o_ex.fe_len, | 3895 | (unsigned long)ac->ac_o_ex.fe_len, |
@@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3871 | (unsigned long)ac->ac_b_ex.fe_len, | 3903 | (unsigned long)ac->ac_b_ex.fe_len, |
3872 | (unsigned long)ac->ac_b_ex.fe_logical, | 3904 | (unsigned long)ac->ac_b_ex.fe_logical, |
3873 | (int)ac->ac_criteria); | 3905 | (int)ac->ac_criteria); |
3874 | printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, | 3906 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", |
3875 | ac->ac_found); | 3907 | ac->ac_ex_scanned, ac->ac_found); |
3876 | printk(KERN_ERR "EXT4-fs: groups: \n"); | 3908 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); |
3877 | ngroups = ext4_get_groups_count(sb); | 3909 | ngroups = ext4_get_groups_count(sb); |
3878 | for (i = 0; i < ngroups; i++) { | 3910 | for (i = 0; i < ngroups; i++) { |
3879 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); | 3911 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); |
@@ -4637,7 +4669,7 @@ do_more: | |||
4637 | } | 4669 | } |
4638 | ext4_mark_super_dirty(sb); | 4670 | ext4_mark_super_dirty(sb); |
4639 | error_return: | 4671 | error_return: |
4640 | if (freed) | 4672 | if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) |
4641 | dquot_free_block(inode, freed); | 4673 | dquot_free_block(inode, freed); |
4642 | brelse(bitmap_bh); | 4674 | brelse(bitmap_bh); |
4643 | ext4_std_error(sb, err); | 4675 | ext4_std_error(sb, err); |
@@ -4645,7 +4677,7 @@ error_return: | |||
4645 | } | 4677 | } |
4646 | 4678 | ||
4647 | /** | 4679 | /** |
4648 | * ext4_add_groupblocks() -- Add given blocks to an existing group | 4680 | * ext4_group_add_blocks() -- Add given blocks to an existing group |
4649 | * @handle: handle to this transaction | 4681 | * @handle: handle to this transaction |
4650 | * @sb: super block | 4682 | * @sb: super block |
4651 | * @block: start physical block to add to the block group | 4683 | * @block: start physical block to add to the block group |
@@ -4653,7 +4685,7 @@ error_return: | |||
4653 | * | 4685 | * |
4654 | * This marks the blocks as free in the bitmap and buddy. | 4686 | * This marks the blocks as free in the bitmap and buddy. |
4655 | */ | 4687 | */ |
4656 | void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | 4688 | int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, |
4657 | ext4_fsblk_t block, unsigned long count) | 4689 | ext4_fsblk_t block, unsigned long count) |
4658 | { | 4690 | { |
4659 | struct buffer_head *bitmap_bh = NULL; | 4691 | struct buffer_head *bitmap_bh = NULL; |
@@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4666 | struct ext4_buddy e4b; | 4698 | struct ext4_buddy e4b; |
4667 | int err = 0, ret, blk_free_count; | 4699 | int err = 0, ret, blk_free_count; |
4668 | ext4_grpblk_t blocks_freed; | 4700 | ext4_grpblk_t blocks_freed; |
4669 | struct ext4_group_info *grp; | ||
4670 | 4701 | ||
4671 | ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); | 4702 | ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); |
4672 | 4703 | ||
4704 | if (count == 0) | ||
4705 | return 0; | ||
4706 | |||
4673 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); | 4707 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); |
4674 | grp = ext4_get_group_info(sb, block_group); | ||
4675 | /* | 4708 | /* |
4676 | * Check to see if we are freeing blocks across a group | 4709 | * Check to see if we are freeing blocks across a group |
4677 | * boundary. | 4710 | * boundary. |
4678 | */ | 4711 | */ |
4679 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) | 4712 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { |
4713 | ext4_warning(sb, "too much blocks added to group %u\n", | ||
4714 | block_group); | ||
4715 | err = -EINVAL; | ||
4680 | goto error_return; | 4716 | goto error_return; |
4717 | } | ||
4681 | 4718 | ||
4682 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); | 4719 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
4683 | if (!bitmap_bh) | 4720 | if (!bitmap_bh) { |
4721 | err = -EIO; | ||
4684 | goto error_return; | 4722 | goto error_return; |
4723 | } | ||
4724 | |||
4685 | desc = ext4_get_group_desc(sb, block_group, &gd_bh); | 4725 | desc = ext4_get_group_desc(sb, block_group, &gd_bh); |
4686 | if (!desc) | 4726 | if (!desc) { |
4727 | err = -EIO; | ||
4687 | goto error_return; | 4728 | goto error_return; |
4729 | } | ||
4688 | 4730 | ||
4689 | if (in_range(ext4_block_bitmap(sb, desc), block, count) || | 4731 | if (in_range(ext4_block_bitmap(sb, desc), block, count) || |
4690 | in_range(ext4_inode_bitmap(sb, desc), block, count) || | 4732 | in_range(ext4_inode_bitmap(sb, desc), block, count) || |
@@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4694 | ext4_error(sb, "Adding blocks in system zones - " | 4736 | ext4_error(sb, "Adding blocks in system zones - " |
4695 | "Block = %llu, count = %lu", | 4737 | "Block = %llu, count = %lu", |
4696 | block, count); | 4738 | block, count); |
4739 | err = -EINVAL; | ||
4697 | goto error_return; | 4740 | goto error_return; |
4698 | } | 4741 | } |
4699 | 4742 | ||
@@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4762 | error_return: | 4805 | error_return: |
4763 | brelse(bitmap_bh); | 4806 | brelse(bitmap_bh); |
4764 | ext4_std_error(sb, err); | 4807 | ext4_std_error(sb, err); |
4765 | return; | 4808 | return err; |
4766 | } | 4809 | } |
4767 | 4810 | ||
4768 | /** | 4811 | /** |
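Since ext4_group_add_blocks() now returns an int, callers can propagate failures instead of depending on ext4_std_error() alone. A hypothetical caller, purely to show the new contract (this wrapper is not part of the patch):

    static int add_blocks_checked(handle_t *handle, struct super_block *sb,
                                  ext4_fsblk_t block, unsigned long count)
    {
            int err = ext4_group_add_blocks(handle, sb, block, count);

            if (err)
                    ext4_warning(sb, "adding %lu blocks at %llu failed: %d",
                                 count, block, err);
            return err;
    }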
@@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, | |||
4782 | { | 4825 | { |
4783 | struct ext4_free_extent ex; | 4826 | struct ext4_free_extent ex; |
4784 | 4827 | ||
4828 | trace_ext4_trim_extent(sb, group, start, count); | ||
4829 | |||
4785 | assert_spin_locked(ext4_group_lock_ptr(sb, group)); | 4830 | assert_spin_locked(ext4_group_lock_ptr(sb, group)); |
4786 | 4831 | ||
4787 | ex.fe_start = start; | 4832 | ex.fe_start = start; |
@@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, | |||
4802 | /** | 4847 | /** |
4803 | * ext4_trim_all_free -- function to trim all free space in alloc. group | 4848 | * ext4_trim_all_free -- function to trim all free space in alloc. group |
4804 | * @sb: super block for file system | 4849 | * @sb: super block for file system |
4805 | * @e4b: ext4 buddy | 4850 | * @group: group to be trimmed |
4806 | * @start: first group block to examine | 4851 | * @start: first group block to examine |
4807 | * @max: last group block to examine | 4852 | * @max: last group block to examine |
4808 | * @minblocks: minimum extent block count | 4853 | * @minblocks: minimum extent block count |
@@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4823 | ext4_grpblk_t minblocks) | 4868 | ext4_grpblk_t minblocks) |
4824 | { | 4869 | { |
4825 | void *bitmap; | 4870 | void *bitmap; |
4826 | ext4_grpblk_t next, count = 0; | 4871 | ext4_grpblk_t next, count = 0, free_count = 0; |
4827 | struct ext4_buddy e4b; | 4872 | struct ext4_buddy e4b; |
4828 | int ret; | 4873 | int ret; |
4829 | 4874 | ||
4875 | trace_ext4_trim_all_free(sb, group, start, max); | ||
4876 | |||
4830 | ret = ext4_mb_load_buddy(sb, group, &e4b); | 4877 | ret = ext4_mb_load_buddy(sb, group, &e4b); |
4831 | if (ret) { | 4878 | if (ret) { |
4832 | ext4_error(sb, "Error in loading buddy " | 4879 | ext4_error(sb, "Error in loading buddy " |
@@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4836 | bitmap = e4b.bd_bitmap; | 4883 | bitmap = e4b.bd_bitmap; |
4837 | 4884 | ||
4838 | ext4_lock_group(sb, group); | 4885 | ext4_lock_group(sb, group); |
4886 | if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && | ||
4887 | minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) | ||
4888 | goto out; | ||
4889 | |||
4839 | start = (e4b.bd_info->bb_first_free > start) ? | 4890 | start = (e4b.bd_info->bb_first_free > start) ? |
4840 | e4b.bd_info->bb_first_free : start; | 4891 | e4b.bd_info->bb_first_free : start; |
4841 | 4892 | ||
@@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4850 | next - start, group, &e4b); | 4901 | next - start, group, &e4b); |
4851 | count += next - start; | 4902 | count += next - start; |
4852 | } | 4903 | } |
4904 | free_count += next - start; | ||
4853 | start = next + 1; | 4905 | start = next + 1; |
4854 | 4906 | ||
4855 | if (fatal_signal_pending(current)) { | 4907 | if (fatal_signal_pending(current)) { |
@@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4863 | ext4_lock_group(sb, group); | 4915 | ext4_lock_group(sb, group); |
4864 | } | 4916 | } |
4865 | 4917 | ||
4866 | if ((e4b.bd_info->bb_free - count) < minblocks) | 4918 | if ((e4b.bd_info->bb_free - free_count) < minblocks) |
4867 | break; | 4919 | break; |
4868 | } | 4920 | } |
4921 | |||
4922 | if (!ret) | ||
4923 | EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); | ||
4924 | out: | ||
4869 | ext4_unlock_group(sb, group); | 4925 | ext4_unlock_group(sb, group); |
4870 | ext4_mb_unload_buddy(&e4b); | 4926 | ext4_mb_unload_buddy(&e4b); |
4871 | 4927 | ||
@@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
4904 | 4960 | ||
4905 | if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) | 4961 | if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) |
4906 | return -EINVAL; | 4962 | return -EINVAL; |
4963 | if (start + len <= first_data_blk) | ||
4964 | goto out; | ||
4907 | if (start < first_data_blk) { | 4965 | if (start < first_data_blk) { |
4908 | len -= first_data_blk - start; | 4966 | len -= first_data_blk - start; |
4909 | start = first_data_blk; | 4967 | start = first_data_blk; |
@@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
4952 | } | 5010 | } |
4953 | range->len = trimmed * sb->s_blocksize; | 5011 | range->len = trimmed * sb->s_blocksize; |
4954 | 5012 | ||
5013 | if (!ret) | ||
5014 | atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); | ||
5015 | |||
5016 | out: | ||
4955 | return ret; | 5017 | return ret; |
4956 | } | 5018 | } |
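Taken together, the ext4_trim_fs() changes cache the minlen of the last successful trim in s_last_trim_minblks, and ext4_trim_all_free() then skips any group still flagged as trimmed unless the caller now asks for smaller extents. A user-space model of that skip test (names are illustrative):

    #include <stdbool.h>

    /* re-trim only if never trimmed, or a stricter minlen is requested */
    static bool need_trim(bool group_was_trimmed,
                          unsigned long minblocks,
                          unsigned long last_trim_minblks)
    {
            return !group_was_trimmed || minblocks < last_trim_minblks;
    }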
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7bfebd1..9d4a636b546c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -187,7 +187,6 @@ struct ext4_allocation_context { | |||
187 | __u16 ac_flags; /* allocation hints */ | 187 | __u16 ac_flags; /* allocation hints */ |
188 | __u8 ac_status; | 188 | __u8 ac_status; |
189 | __u8 ac_criteria; | 189 | __u8 ac_criteria; |
190 | __u8 ac_repeats; | ||
191 | __u8 ac_2order; /* if request is to allocate 2^N blocks and | 190 | __u8 ac_2order; /* if request is to allocate 2^N blocks and |
192 | * N > 0, the field stores N, otherwise 0 */ | 191 | * N > 0, the field stores N, otherwise 0 */ |
193 | __u8 ac_op; /* operation, for history only */ | 192 | __u8 ac_op; /* operation, for history only */ |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8c9babac43dc..565a154e22d4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent | |||
289 | while (len--) printk("%c", *name++); | 289 | while (len--) printk("%c", *name++); |
290 | ext4fs_dirhash(de->name, de->name_len, &h); | 290 | ext4fs_dirhash(de->name, de->name_len, &h); |
291 | printk(":%x.%u ", h.hash, | 291 | printk(":%x.%u ", h.hash, |
292 | ((char *) de - base)); | 292 | (unsigned) ((char *) de - base)); |
293 | } | 293 | } |
294 | space += EXT4_DIR_REC_LEN(de->name_len); | 294 | space += EXT4_DIR_REC_LEN(de->name_len); |
295 | names++; | 295 | names++; |
@@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q | |||
1013 | 1013 | ||
1014 | *err = -ENOENT; | 1014 | *err = -ENOENT; |
1015 | errout: | 1015 | errout: |
1016 | dxtrace(printk(KERN_DEBUG "%s not found\n", name)); | 1016 | dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); |
1017 | dx_release (frames); | 1017 | dx_release (frames); |
1018 | return NULL; | 1018 | return NULL; |
1019 | } | 1019 | } |
@@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
1985 | if (!list_empty(&EXT4_I(inode)->i_orphan)) | 1985 | if (!list_empty(&EXT4_I(inode)->i_orphan)) |
1986 | goto out_unlock; | 1986 | goto out_unlock; |
1987 | 1987 | ||
1988 | /* Orphan handling is only valid for files with data blocks | 1988 | /* |
1989 | * being truncated, or files being unlinked. */ | 1989 | * Orphan handling is only valid for files with data blocks |
1990 | 1990 | * being truncated, or files being unlinked. Note that we either | |
1991 | /* @@@ FIXME: Observation from aviro: | 1991 | * hold i_mutex, or the inode cannot be referenced from outside, |
1992 | * I think I can trigger J_ASSERT in ext4_orphan_add(). We block | 1992 | * so i_nlink should not be bumped due to a race |
1993 | * here (on s_orphan_lock), so race with ext4_link() which might bump | ||
1994 | * ->i_nlink. For, say it, character device. Not a regular file, | ||
1995 | * not a directory, not a symlink and ->i_nlink > 0. | ||
1996 | * | ||
1997 | * tytso, 4/25/2009: I'm not sure how that could happen; | ||
1998 | * shouldn't the fs core protect us from these sort of | ||
1999 | * unlink()/link() races? | ||
2000 | */ | 1993 | */ |
2001 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 1994 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
2002 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); | 1995 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76d470a..430c401d0895 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -285,11 +285,7 @@ static int io_submit_init(struct ext4_io_submit *io, | |||
285 | io_end = ext4_init_io_end(inode, GFP_NOFS); | 285 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
286 | if (!io_end) | 286 | if (!io_end) |
287 | return -ENOMEM; | 287 | return -ENOMEM; |
288 | do { | 288 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); |
289 | bio = bio_alloc(GFP_NOIO, nvecs); | ||
290 | nvecs >>= 1; | ||
291 | } while (bio == NULL); | ||
292 | |||
293 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 289 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
294 | bio->bi_bdev = bh->b_bdev; | 290 | bio->bi_bdev = bh->b_bdev; |
295 | bio->bi_private = io->io_end = io_end; | 291 | bio->bi_private = io->io_end = io_end; |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 80bbc9c60c24..707d3f16f7ce 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -16,6 +16,35 @@ | |||
16 | 16 | ||
17 | #include "ext4_jbd2.h" | 17 | #include "ext4_jbd2.h" |
18 | 18 | ||
19 | int ext4_resize_begin(struct super_block *sb) | ||
20 | { | ||
21 | int ret = 0; | ||
22 | |||
23 | if (!capable(CAP_SYS_RESOURCE)) | ||
24 | return -EPERM; | ||
25 | |||
26 | /* | ||
27 | * We are not allowed to do online-resizing on a filesystem mounted | ||
28 | * with errors, because it can destroy the filesystem easily. | ||
29 | */ | ||
30 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { | ||
31 | ext4_warning(sb, "There are errors in the filesystem, " | ||
32 | "so online resizing is not allowed\n"); | ||
33 | return -EPERM; | ||
34 | } | ||
35 | |||
36 | if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) | ||
37 | ret = -EBUSY; | ||
38 | |||
39 | return ret; | ||
40 | } | ||
41 | |||
42 | void ext4_resize_end(struct super_block *sb) | ||
43 | { | ||
44 | clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); | ||
45 | smp_mb__after_clear_bit(); | ||
46 | } | ||
47 | |||
19 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) | 48 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) |
20 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) | 49 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) |
21 | 50 | ||
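These two helpers replace the old s_resize_lock serialization with a single EXT4_RESIZING bit: test_and_set_bit_lock() acquires it (failing with -EBUSY if another resizer is active) and clear_bit_unlock() releases it. The ioctl paths are expected to bracket their work as in this rough sketch (error handling trimmed; compare the ext4_resize_end() call added to the ioctl hunk at the top of this diff):

    static long resize_ioctl_sketch(struct super_block *sb)
    {
            long err = ext4_resize_begin(sb); /* -EBUSY if resize in flight */

            if (err)
                    return err;
            /* ... ext4_group_add() or ext4_group_extend() work ... */
            ext4_resize_end(sb);              /* drops the EXT4_RESIZING bit */
            return 0;
    }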
@@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, | |||
118 | brelse(bh); | 147 | brelse(bh); |
119 | bh = ERR_PTR(err); | 148 | bh = ERR_PTR(err); |
120 | } else { | 149 | } else { |
121 | lock_buffer(bh); | ||
122 | memset(bh->b_data, 0, sb->s_blocksize); | 150 | memset(bh->b_data, 0, sb->s_blocksize); |
123 | set_buffer_uptodate(bh); | 151 | set_buffer_uptodate(bh); |
124 | unlock_buffer(bh); | ||
125 | } | 152 | } |
126 | 153 | ||
127 | return bh; | 154 | return bh; |
@@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, | |||
132 | * If that fails, restart the transaction & regain write access for the | 159 | * If that fails, restart the transaction & regain write access for the |
133 | * buffer head which is used for block_bitmap modifications. | 160 | * buffer head which is used for block_bitmap modifications. |
134 | */ | 161 | */ |
135 | static int extend_or_restart_transaction(handle_t *handle, int thresh, | 162 | static int extend_or_restart_transaction(handle_t *handle, int thresh) |
136 | struct buffer_head *bh) | ||
137 | { | 163 | { |
138 | int err; | 164 | int err; |
139 | 165 | ||
@@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, | |||
144 | if (err < 0) | 170 | if (err < 0) |
145 | return err; | 171 | return err; |
146 | if (err) { | 172 | if (err) { |
147 | if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) | 173 | err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); |
148 | return err; | 174 | if (err) |
149 | if ((err = ext4_journal_get_write_access(handle, bh))) | ||
150 | return err; | 175 | return err; |
151 | } | 176 | } |
152 | 177 | ||
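After this rework the helper only manages journal credits; re-taking write access on the bitmap buffer is the caller's job. The underlying extend-or-restart pattern, sketched with the real jbd2 wrappers (the check for already-sufficient credits is elided here):

    static int extend_or_restart_sketch(handle_t *handle, int thresh)
    {
            int err = ext4_journal_extend(handle, thresh);

            if (err < 0)    /* hard failure */
                    return err;
            if (err > 0)    /* cannot extend: restart with fresh credits */
                    err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
            return err;
    }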
@@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
181 | if (IS_ERR(handle)) | 206 | if (IS_ERR(handle)) |
182 | return PTR_ERR(handle); | 207 | return PTR_ERR(handle); |
183 | 208 | ||
184 | mutex_lock(&sbi->s_resize_lock); | 209 | BUG_ON(input->group != sbi->s_groups_count); |
185 | if (input->group != sbi->s_groups_count) { | ||
186 | err = -EBUSY; | ||
187 | goto exit_journal; | ||
188 | } | ||
189 | |||
190 | if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { | ||
191 | err = PTR_ERR(bh); | ||
192 | goto exit_journal; | ||
193 | } | ||
194 | |||
195 | if (ext4_bg_has_super(sb, input->group)) { | ||
196 | ext4_debug("mark backup superblock %#04llx (+0)\n", start); | ||
197 | ext4_set_bit(0, bh->b_data); | ||
198 | } | ||
199 | 210 | ||
200 | /* Copy all of the GDT blocks into the backup in this group */ | 211 | /* Copy all of the GDT blocks into the backup in this group */ |
201 | for (i = 0, bit = 1, block = start + 1; | 212 | for (i = 0, bit = 1, block = start + 1; |
@@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
203 | struct buffer_head *gdb; | 214 | struct buffer_head *gdb; |
204 | 215 | ||
205 | ext4_debug("update backup group %#04llx (+%d)\n", block, bit); | 216 | ext4_debug("update backup group %#04llx (+%d)\n", block, bit); |
206 | 217 | err = extend_or_restart_transaction(handle, 1); | |
207 | if ((err = extend_or_restart_transaction(handle, 1, bh))) | 218 | if (err) |
208 | goto exit_bh; | 219 | goto exit_journal; |
209 | 220 | ||
210 | gdb = sb_getblk(sb, block); | 221 | gdb = sb_getblk(sb, block); |
211 | if (!gdb) { | 222 | if (!gdb) { |
212 | err = -EIO; | 223 | err = -EIO; |
213 | goto exit_bh; | 224 | goto exit_journal; |
214 | } | 225 | } |
215 | if ((err = ext4_journal_get_write_access(handle, gdb))) { | 226 | if ((err = ext4_journal_get_write_access(handle, gdb))) { |
216 | brelse(gdb); | 227 | brelse(gdb); |
217 | goto exit_bh; | 228 | goto exit_journal; |
218 | } | 229 | } |
219 | lock_buffer(gdb); | ||
220 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); | 230 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); |
221 | set_buffer_uptodate(gdb); | 231 | set_buffer_uptodate(gdb); |
222 | unlock_buffer(gdb); | ||
223 | err = ext4_handle_dirty_metadata(handle, NULL, gdb); | 232 | err = ext4_handle_dirty_metadata(handle, NULL, gdb); |
224 | if (unlikely(err)) { | 233 | if (unlikely(err)) { |
225 | brelse(gdb); | 234 | brelse(gdb); |
226 | goto exit_bh; | 235 | goto exit_journal; |
227 | } | 236 | } |
228 | ext4_set_bit(bit, bh->b_data); | ||
229 | brelse(gdb); | 237 | brelse(gdb); |
230 | } | 238 | } |
231 | 239 | ||
@@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
235 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, | 243 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, |
236 | GFP_NOFS); | 244 | GFP_NOFS); |
237 | if (err) | 245 | if (err) |
238 | goto exit_bh; | 246 | goto exit_journal; |
239 | for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) | 247 | |
240 | ext4_set_bit(bit, bh->b_data); | 248 | err = extend_or_restart_transaction(handle, 2); |
249 | if (err) | ||
250 | goto exit_journal; | ||
251 | |||
252 | bh = bclean(handle, sb, input->block_bitmap); | ||
253 | if (IS_ERR(bh)) { | ||
254 | err = PTR_ERR(bh); | ||
255 | goto exit_journal; | ||
256 | } | ||
257 | |||
258 | if (ext4_bg_has_super(sb, input->group)) { | ||
259 | ext4_debug("mark backup group tables %#04llx (+0)\n", start); | ||
260 | ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); | ||
261 | } | ||
241 | 262 | ||
242 | ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, | 263 | ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, |
243 | input->block_bitmap - start); | 264 | input->block_bitmap - start); |
@@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
253 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); | 274 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); |
254 | if (err) | 275 | if (err) |
255 | goto exit_bh; | 276 | goto exit_bh; |
256 | for (i = 0, bit = input->inode_table - start; | 277 | ext4_set_bits(bh->b_data, input->inode_table - start, |
257 | i < sbi->s_itb_per_group; i++, bit++) | 278 | sbi->s_itb_per_group); |
258 | ext4_set_bit(bit, bh->b_data); | ||
259 | 279 | ||
260 | if ((err = extend_or_restart_transaction(handle, 2, bh))) | ||
261 | goto exit_bh; | ||
262 | 280 | ||
263 | ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, | 281 | ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, |
264 | bh->b_data); | 282 | bh->b_data); |
@@ -285,7 +303,6 @@ exit_bh: | |||
285 | brelse(bh); | 303 | brelse(bh); |
286 | 304 | ||
287 | exit_journal: | 305 | exit_journal: |
288 | mutex_unlock(&sbi->s_resize_lock); | ||
289 | if ((err2 = ext4_journal_stop(handle)) && !err) | 306 | if ((err2 = ext4_journal_stop(handle)) && !err) |
290 | err = err2; | 307 | err = err2; |
291 | 308 | ||
@@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb, | |||
377 | * fail once we start modifying the data on disk, because JBD has no rollback. | 394 | * fail once we start modifying the data on disk, because JBD has no rollback. |
378 | */ | 395 | */ |
379 | static int add_new_gdb(handle_t *handle, struct inode *inode, | 396 | static int add_new_gdb(handle_t *handle, struct inode *inode, |
380 | struct ext4_new_group_data *input, | 397 | ext4_group_t group) |
381 | struct buffer_head **primary) | ||
382 | { | 398 | { |
383 | struct super_block *sb = inode->i_sb; | 399 | struct super_block *sb = inode->i_sb; |
384 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 400 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
385 | unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | 401 | unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); |
386 | ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; | 402 | ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; |
387 | struct buffer_head **o_group_desc, **n_group_desc; | 403 | struct buffer_head **o_group_desc, **n_group_desc; |
388 | struct buffer_head *dind; | 404 | struct buffer_head *dind; |
405 | struct buffer_head *gdb_bh; | ||
389 | int gdbackups; | 406 | int gdbackups; |
390 | struct ext4_iloc iloc; | 407 | struct ext4_iloc iloc; |
391 | __le32 *data; | 408 | __le32 *data; |
@@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
408 | return -EPERM; | 425 | return -EPERM; |
409 | } | 426 | } |
410 | 427 | ||
411 | *primary = sb_bread(sb, gdblock); | 428 | gdb_bh = sb_bread(sb, gdblock); |
412 | if (!*primary) | 429 | if (!gdb_bh) |
413 | return -EIO; | 430 | return -EIO; |
414 | 431 | ||
415 | if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { | 432 | gdbackups = verify_reserved_gdb(sb, gdb_bh); |
433 | if (gdbackups < 0) { | ||
416 | err = gdbackups; | 434 | err = gdbackups; |
417 | goto exit_bh; | 435 | goto exit_bh; |
418 | } | 436 | } |
@@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
427 | data = (__le32 *)dind->b_data; | 445 | data = (__le32 *)dind->b_data; |
428 | if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { | 446 | if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { |
429 | ext4_warning(sb, "new group %u GDT block %llu not reserved", | 447 | ext4_warning(sb, "new group %u GDT block %llu not reserved", |
430 | input->group, gdblock); | 448 | group, gdblock); |
431 | err = -EINVAL; | 449 | err = -EINVAL; |
432 | goto exit_dind; | 450 | goto exit_dind; |
433 | } | 451 | } |
@@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
436 | if (unlikely(err)) | 454 | if (unlikely(err)) |
437 | goto exit_dind; | 455 | goto exit_dind; |
438 | 456 | ||
439 | err = ext4_journal_get_write_access(handle, *primary); | 457 | err = ext4_journal_get_write_access(handle, gdb_bh); |
440 | if (unlikely(err)) | 458 | if (unlikely(err)) |
441 | goto exit_sbh; | 459 | goto exit_sbh; |
442 | 460 | ||
@@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
449 | if (unlikely(err)) | 467 | if (unlikely(err)) |
450 | goto exit_dindj; | 468 | goto exit_dindj; |
451 | 469 | ||
452 | n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), | 470 | n_group_desc = ext4_kvmalloc((gdb_num + 1) * |
453 | GFP_NOFS); | 471 | sizeof(struct buffer_head *), |
472 | GFP_NOFS); | ||
454 | if (!n_group_desc) { | 473 | if (!n_group_desc) { |
455 | err = -ENOMEM; | 474 | err = -ENOMEM; |
456 | ext4_warning(sb, | 475 | ext4_warning(sb, "not enough memory for %lu groups", |
457 | "not enough memory for %lu groups", gdb_num + 1); | 476 | gdb_num + 1); |
458 | goto exit_inode; | 477 | goto exit_inode; |
459 | } | 478 | } |
460 | 479 | ||
@@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
475 | } | 494 | } |
476 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; | 495 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; |
477 | ext4_mark_iloc_dirty(handle, inode, &iloc); | 496 | ext4_mark_iloc_dirty(handle, inode, &iloc); |
478 | memset((*primary)->b_data, 0, sb->s_blocksize); | 497 | memset(gdb_bh->b_data, 0, sb->s_blocksize); |
479 | err = ext4_handle_dirty_metadata(handle, NULL, *primary); | 498 | err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); |
480 | if (unlikely(err)) { | 499 | if (unlikely(err)) { |
481 | ext4_std_error(sb, err); | 500 | ext4_std_error(sb, err); |
482 | goto exit_inode; | 501 | goto exit_inode; |
@@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
486 | o_group_desc = EXT4_SB(sb)->s_group_desc; | 505 | o_group_desc = EXT4_SB(sb)->s_group_desc; |
487 | memcpy(n_group_desc, o_group_desc, | 506 | memcpy(n_group_desc, o_group_desc, |
488 | EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); | 507 | EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); |
489 | n_group_desc[gdb_num] = *primary; | 508 | n_group_desc[gdb_num] = gdb_bh; |
490 | EXT4_SB(sb)->s_group_desc = n_group_desc; | 509 | EXT4_SB(sb)->s_group_desc = n_group_desc; |
491 | EXT4_SB(sb)->s_gdb_count++; | 510 | EXT4_SB(sb)->s_gdb_count++; |
492 | kfree(o_group_desc); | 511 | ext4_kvfree(o_group_desc); |
493 | 512 | ||
494 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); | 513 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); |
495 | err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); | 514 | err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); |
@@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
499 | return err; | 518 | return err; |
500 | 519 | ||
501 | exit_inode: | 520 | exit_inode: |
521 | ext4_kvfree(n_group_desc); | ||
502 | /* ext4_handle_release_buffer(handle, iloc.bh); */ | 522 | /* ext4_handle_release_buffer(handle, iloc.bh); */ |
503 | brelse(iloc.bh); | 523 | brelse(iloc.bh); |
504 | exit_dindj: | 524 | exit_dindj: |
@@ -508,7 +528,7 @@ exit_sbh: | |||
508 | exit_dind: | 528 | exit_dind: |
509 | brelse(dind); | 529 | brelse(dind); |
510 | exit_bh: | 530 | exit_bh: |
511 | brelse(*primary); | 531 | brelse(gdb_bh); |
512 | 532 | ||
513 | ext4_debug("leaving with error %d\n", err); | 533 | ext4_debug("leaving with error %d\n", err); |
514 | return err; | 534 | return err; |
@@ -528,7 +548,7 @@ exit_bh: | |||
528 | * backup GDT blocks are stored in their reserved primary GDT block. | 548 | * backup GDT blocks are stored in their reserved primary GDT block. |
529 | */ | 549 | */ |
530 | static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | 550 | static int reserve_backup_gdb(handle_t *handle, struct inode *inode, |
531 | struct ext4_new_group_data *input) | 551 | ext4_group_t group) |
532 | { | 552 | { |
533 | struct super_block *sb = inode->i_sb; | 553 | struct super_block *sb = inode->i_sb; |
534 | int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); | 554 | int reserved_gdb =le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); |
@@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | |||
599 | * Finally we can add each of the reserved backup GDT blocks from | 619 | * Finally we can add each of the reserved backup GDT blocks from |
600 | * the new group to its reserved primary GDT block. | 620 | * the new group to its reserved primary GDT block. |
601 | */ | 621 | */ |
602 | blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); | 622 | blk = group * EXT4_BLOCKS_PER_GROUP(sb); |
603 | for (i = 0; i < reserved_gdb; i++) { | 623 | for (i = 0; i < reserved_gdb; i++) { |
604 | int err2; | 624 | int err2; |
605 | data = (__le32 *)primary[i]->b_data; | 625 | data = (__le32 *)primary[i]->b_data; |
@@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
799 | goto exit_put; | 819 | goto exit_put; |
800 | } | 820 | } |
801 | 821 | ||
802 | mutex_lock(&sbi->s_resize_lock); | ||
803 | if (input->group != sbi->s_groups_count) { | ||
804 | ext4_warning(sb, "multiple resizers run on filesystem!"); | ||
805 | err = -EBUSY; | ||
806 | goto exit_journal; | ||
807 | } | ||
808 | |||
809 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) | 822 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) |
810 | goto exit_journal; | 823 | goto exit_journal; |
811 | 824 | ||
@@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
820 | if ((err = ext4_journal_get_write_access(handle, primary))) | 833 | if ((err = ext4_journal_get_write_access(handle, primary))) |
821 | goto exit_journal; | 834 | goto exit_journal; |
822 | 835 | ||
823 | if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && | 836 | if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { |
824 | (err = reserve_backup_gdb(handle, inode, input))) | 837 | err = reserve_backup_gdb(handle, inode, input->group); |
838 | if (err) | ||
839 | goto exit_journal; | ||
840 | } | ||
841 | } else { | ||
842 | /* | ||
843 | * Note that we can access new group descriptor block safely | ||
844 | * only if add_new_gdb() succeeds. | ||
845 | */ | ||
846 | err = add_new_gdb(handle, inode, input->group); | ||
847 | if (err) | ||
825 | goto exit_journal; | 848 | goto exit_journal; |
826 | } else if ((err = add_new_gdb(handle, inode, input, &primary))) | 849 | primary = sbi->s_group_desc[gdb_num]; |
827 | goto exit_journal; | 850 | } |
828 | 851 | ||
829 | /* | 852 | /* |
830 | * OK, now we've set up the new group. Time to make it active. | 853 | * OK, now we've set up the new group. Time to make it active. |
831 | * | 854 | * |
832 | * We do not lock all allocations via s_resize_lock | ||
833 | * so we have to be safe wrt. concurrent accesses to the group | 855 | * so we have to be safe wrt. concurrent accesses to the group |
834 | * data. So we need to be careful to set all of the relevant | 856 | * data. So we need to be careful to set all of the relevant |
835 | * group descriptor data etc. *before* we enable the group. | 857 | * group descriptor data etc. *before* we enable the group. |
@@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
886 | * | 908 | * |
887 | * The precise rules we use are: | 909 | * The precise rules we use are: |
888 | * | 910 | * |
889 | * * Writers of s_groups_count *must* hold s_resize_lock | ||
890 | * AND | ||
891 | * * Writers must perform a smp_wmb() after updating all dependent | 911 | * * Writers must perform a smp_wmb() after updating all dependent |
892 | * data and before modifying the groups count | 912 | * data and before modifying the groups count |
893 | * | 913 | * |
894 | * * Readers must hold s_resize_lock over the access | ||
895 | * OR | ||
896 | * * Readers must perform an smp_rmb() after reading the groups count | 914 | * * Readers must perform an smp_rmb() after reading the groups count |
897 | * and before reading any dependent data. | 915 | * and before reading any dependent data. |
898 | * | 916 | * |
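The writer/reader rules spelled out above pair an smp_wmb() with an smp_rmb(). A minimal sketch of that pairing, using hypothetical helper names (publish_group_count and group_visible are illustrative, not part of this patch):

	/* Writer: initialize everything the new group needs, then publish it. */
	static void publish_group_count(struct ext4_sb_info *sbi,
					ext4_group_t ngroups)
	{
		/* ... fill in group descriptor, bitmaps, per-group counters ... */
		smp_wmb();			/* dependent data before the count */
		sbi->s_groups_count = ngroups;	/* readers may see the group now */
	}

	/* Reader: load the count first; only then is group data safe to read. */
	static int group_visible(struct ext4_sb_info *sbi, ext4_group_t group)
	{
		ext4_group_t ngroups = sbi->s_groups_count;

		smp_rmb();			/* the count before dependent data */
		return group < ngroups;		/* if true, group data is initialized */
	}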
@@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
937 | ext4_handle_dirty_super(handle, sb); | 955 | ext4_handle_dirty_super(handle, sb); |
938 | 956 | ||
939 | exit_journal: | 957 | exit_journal: |
940 | mutex_unlock(&sbi->s_resize_lock); | ||
941 | if ((err2 = ext4_journal_stop(handle)) && !err) | 958 | if ((err2 = ext4_journal_stop(handle)) && !err) |
942 | err = err2; | 959 | err = err2; |
943 | if (!err) { | 960 | if (!err && primary) { |
944 | update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, | 961 | update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, |
945 | sizeof(struct ext4_super_block)); | 962 | sizeof(struct ext4_super_block)); |
946 | update_backups(sb, primary->b_blocknr, primary->b_data, | 963 | update_backups(sb, primary->b_blocknr, primary->b_data, |
@@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
969 | ext4_grpblk_t add; | 986 | ext4_grpblk_t add; |
970 | struct buffer_head *bh; | 987 | struct buffer_head *bh; |
971 | handle_t *handle; | 988 | handle_t *handle; |
972 | int err; | 989 | int err, err2; |
973 | ext4_group_t group; | 990 | ext4_group_t group; |
974 | 991 | ||
975 | /* We don't need to worry about locking wrt other resizers just | ||
976 | * yet: we're going to revalidate es->s_blocks_count after | ||
977 | * taking the s_resize_lock below. */ | ||
978 | o_blocks_count = ext4_blocks_count(es); | 992 | o_blocks_count = ext4_blocks_count(es); |
979 | 993 | ||
980 | if (test_opt(sb, DEBUG)) | 994 | if (test_opt(sb, DEBUG)) |
981 | printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", | 995 | printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", |
982 | o_blocks_count, n_blocks_count); | 996 | o_blocks_count, n_blocks_count); |
983 | 997 | ||
984 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) | 998 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) |
@@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
995 | 1009 | ||
996 | if (n_blocks_count < o_blocks_count) { | 1010 | if (n_blocks_count < o_blocks_count) { |
997 | ext4_warning(sb, "can't shrink FS - resize aborted"); | 1011 | ext4_warning(sb, "can't shrink FS - resize aborted"); |
998 | return -EBUSY; | 1012 | return -EINVAL; |
999 | } | 1013 | } |
1000 | 1014 | ||
1001 | /* Handle the remaining blocks in the last group only. */ | 1015 | /* Handle the remaining blocks in the last group only. */ |
@@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
1038 | goto exit_put; | 1052 | goto exit_put; |
1039 | } | 1053 | } |
1040 | 1054 | ||
1041 | mutex_lock(&EXT4_SB(sb)->s_resize_lock); | ||
1042 | if (o_blocks_count != ext4_blocks_count(es)) { | ||
1043 | ext4_warning(sb, "multiple resizers run on filesystem!"); | ||
1044 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1045 | ext4_journal_stop(handle); | ||
1046 | err = -EBUSY; | ||
1047 | goto exit_put; | ||
1048 | } | ||
1049 | |||
1050 | if ((err = ext4_journal_get_write_access(handle, | 1055 | if ((err = ext4_journal_get_write_access(handle, |
1051 | EXT4_SB(sb)->s_sbh))) { | 1056 | EXT4_SB(sb)->s_sbh))) { |
1052 | ext4_warning(sb, "error %d on journal write access", err); | 1057 | ext4_warning(sb, "error %d on journal write access", err); |
1053 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1054 | ext4_journal_stop(handle); | 1058 | ext4_journal_stop(handle); |
1055 | goto exit_put; | 1059 | goto exit_put; |
1056 | } | 1060 | } |
1057 | ext4_blocks_count_set(es, o_blocks_count + add); | 1061 | ext4_blocks_count_set(es, o_blocks_count + add); |
1058 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1059 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, | 1062 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, |
1060 | o_blocks_count + add); | 1063 | o_blocks_count + add); |
1061 | /* We add the blocks to the bitmap and set the group need init bit */ | 1064 | /* We add the blocks to the bitmap and set the group need init bit */ |
1062 | ext4_add_groupblocks(handle, sb, o_blocks_count, add); | 1065 | err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); |
1063 | ext4_handle_dirty_super(handle, sb); | 1066 | ext4_handle_dirty_super(handle, sb); |
1064 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, | 1067 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, |
1065 | o_blocks_count + add); | 1068 | o_blocks_count + add); |
1066 | if ((err = ext4_journal_stop(handle))) | 1069 | err2 = ext4_journal_stop(handle); |
1070 | if (!err && err2) | ||
1071 | err = err2; | ||
1072 | |||
1073 | if (err) | ||
1067 | goto exit_put; | 1074 | goto exit_put; |
1068 | 1075 | ||
1069 | if (test_opt(sb, DEBUG)) | 1076 | if (test_opt(sb, DEBUG)) |
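The err/err2 handling above follows a common kernel idiom: ext4_journal_stop() must run even after an earlier failure, and its return value should be surfaced only when nothing before it went wrong. A minimal sketch of the idiom (the function name and elided work are illustrative):

	static int journalled_op(handle_t *handle)
	{
		int err = 0, err2;

		/* ... journalled modifications that may set err ... */

		err2 = ext4_journal_stop(handle);	/* must always run */
		if (!err && err2)
			err = err2;	/* report stop failures without
					 * masking an earlier error */
		return err;
	}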
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa864b3..4687fea0c00f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = { | |||
110 | #define IS_EXT3_SB(sb) (0) | 110 | #define IS_EXT3_SB(sb) (0) |
111 | #endif | 111 | #endif |
112 | 112 | ||
113 | void *ext4_kvmalloc(size_t size, gfp_t flags) | ||
114 | { | ||
115 | void *ret; | ||
116 | |||
117 | ret = kmalloc(size, flags); | ||
118 | if (!ret) | ||
119 | ret = __vmalloc(size, flags, PAGE_KERNEL); | ||
120 | return ret; | ||
121 | } | ||
122 | |||
123 | void *ext4_kvzalloc(size_t size, gfp_t flags) | ||
124 | { | ||
125 | void *ret; | ||
126 | |||
127 | ret = kzalloc(size, flags); | ||
128 | if (!ret) | ||
129 | ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); | ||
130 | return ret; | ||
131 | } | ||
132 | |||
133 | void ext4_kvfree(void *ptr) | ||
134 | { | ||
135 | if (is_vmalloc_addr(ptr)) | ||
136 | vfree(ptr); | ||
137 | else | ||
138 | kfree(ptr); | ||
139 | |||
140 | } | ||
141 | |||
113 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | 142 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, |
114 | struct ext4_group_desc *bg) | 143 | struct ext4_group_desc *bg) |
115 | { | 144 | { |
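ext4_kvmalloc() and ext4_kvzalloc() above implement the try-kmalloc-first, fall-back-to-vmalloc pattern for allocations that can grow too large for the page allocator, and ext4_kvfree() picks the matching release path via is_vmalloc_addr(). A hypothetical caller, mirroring the group-descriptor table allocation later in this patch (the table name is illustrative):

	struct buffer_head **table;

	/* Small tables come from kmalloc(); large ones fall back to __vmalloc(). */
	table = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL);
	if (table == NULL)
		return -ENOMEM;

	/* ... use table ... */

	ext4_kvfree(table);		/* correct for either allocation path */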
@@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) | |||
269 | journal_t *journal; | 298 | journal_t *journal; |
270 | handle_t *handle; | 299 | handle_t *handle; |
271 | 300 | ||
301 | trace_ext4_journal_start(sb, nblocks, _RET_IP_); | ||
272 | if (sb->s_flags & MS_RDONLY) | 302 | if (sb->s_flags & MS_RDONLY) |
273 | return ERR_PTR(-EROFS); | 303 | return ERR_PTR(-EROFS); |
274 | 304 | ||
@@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb) | |||
789 | 819 | ||
790 | for (i = 0; i < sbi->s_gdb_count; i++) | 820 | for (i = 0; i < sbi->s_gdb_count; i++) |
791 | brelse(sbi->s_group_desc[i]); | 821 | brelse(sbi->s_group_desc[i]); |
792 | kfree(sbi->s_group_desc); | 822 | ext4_kvfree(sbi->s_group_desc); |
793 | if (is_vmalloc_addr(sbi->s_flex_groups)) | 823 | ext4_kvfree(sbi->s_flex_groups); |
794 | vfree(sbi->s_flex_groups); | ||
795 | else | ||
796 | kfree(sbi->s_flex_groups); | ||
797 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 824 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
798 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 825 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
799 | percpu_counter_destroy(&sbi->s_dirs_counter); | 826 | percpu_counter_destroy(&sbi->s_dirs_counter); |
@@ -1976,15 +2003,11 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1976 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << | 2003 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << |
1977 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; | 2004 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; |
1978 | size = flex_group_count * sizeof(struct flex_groups); | 2005 | size = flex_group_count * sizeof(struct flex_groups); |
1979 | sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); | 2006 | sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); |
1980 | if (sbi->s_flex_groups == NULL) { | 2007 | if (sbi->s_flex_groups == NULL) { |
1981 | sbi->s_flex_groups = vzalloc(size); | 2008 | ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", |
1982 | if (sbi->s_flex_groups == NULL) { | 2009 | flex_group_count); |
1983 | ext4_msg(sb, KERN_ERR, | 2010 | goto failed; |
1984 | "not enough memory for %u flex groups", | ||
1985 | flex_group_count); | ||
1986 | goto failed; | ||
1987 | } | ||
1988 | } | 2011 | } |
1989 | 2012 | ||
1990 | for (i = 0; i < sbi->s_groups_count; i++) { | 2013 | for (i = 0; i < sbi->s_groups_count; i++) { |
@@ -2383,17 +2406,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | |||
2383 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); | 2406 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); |
2384 | unsigned long stripe_width = | 2407 | unsigned long stripe_width = |
2385 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); | 2408 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); |
2409 | int ret; | ||
2386 | 2410 | ||
2387 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) | 2411 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) |
2388 | return sbi->s_stripe; | 2412 | ret = sbi->s_stripe; |
2389 | 2413 | else if (stripe_width <= sbi->s_blocks_per_group) | |
2390 | if (stripe_width <= sbi->s_blocks_per_group) | 2414 | ret = stripe_width; |
2391 | return stripe_width; | 2415 | else if (stride <= sbi->s_blocks_per_group) |
2416 | ret = stride; | ||
2417 | else | ||
2418 | ret = 0; | ||
2392 | 2419 | ||
2393 | if (stride <= sbi->s_blocks_per_group) | 2420 | /* |
2394 | return stride; | 2421 | * If the stripe width is 1, this makes no sense and |
2422 | * we set it to 0 to turn off stripe handling code. | ||
2423 | */ | ||
2424 | if (ret <= 1) | ||
2425 | ret = 0; | ||
2395 | 2426 | ||
2396 | return 0; | 2427 | return ret; |
2397 | } | 2428 | } |
2398 | 2429 | ||
2399 | /* sysfs support */ | 2430 | /* sysfs support */
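The restructured ext4_get_stripe_size() keeps the old precedence (the s_stripe mount option first, then s_raid_stripe_width, then s_raid_stride) but routes every case through ret so the new ret <= 1 guard applies uniformly:

	/*
	 * Worked example with made-up values: sbi->s_stripe == 0 (no -o stripe=
	 * option), s_raid_stripe_width == 1, s_raid_stride == 16.  The chain
	 * selects ret = stripe_width = 1; the old code returned that 1 directly,
	 * while the new ret <= 1 check folds it to 0, turning off the
	 * stripe-aligned allocation paths for such degenerate geometries.
	 */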
@@ -3408,8 +3439,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3408 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | 3439 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); |
3409 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | 3440 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
3410 | EXT4_DESC_PER_BLOCK(sb); | 3441 | EXT4_DESC_PER_BLOCK(sb); |
3411 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), | 3442 | sbi->s_group_desc = ext4_kvmalloc(db_count * |
3412 | GFP_KERNEL); | 3443 | sizeof(struct buffer_head *), |
3444 | GFP_KERNEL); | ||
3413 | if (sbi->s_group_desc == NULL) { | 3445 | if (sbi->s_group_desc == NULL) { |
3414 | ext4_msg(sb, KERN_ERR, "not enough memory"); | 3446 | ext4_msg(sb, KERN_ERR, "not enough memory"); |
3415 | goto failed_mount; | 3447 | goto failed_mount; |
@@ -3491,7 +3523,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3491 | 3523 | ||
3492 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ | 3524 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ |
3493 | mutex_init(&sbi->s_orphan_lock); | 3525 | mutex_init(&sbi->s_orphan_lock); |
3494 | mutex_init(&sbi->s_resize_lock); | 3526 | sbi->s_resize_flags = 0; |
3495 | 3527 | ||
3496 | sb->s_root = NULL; | 3528 | sb->s_root = NULL; |
3497 | 3529 | ||
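With mutex_init(&sbi->s_resize_lock) gone, the new s_resize_flags word suggests that resizer exclusion is now done with an atomic bit taken once per resize operation rather than a mutex held across it. A sketch of what such begin/end helpers could look like (EXT4_RESIZING and both helper names are assumptions inferred from the field above; they do not appear in this hunk):

	/* Assumed flag bit within sbi->s_resize_flags. */
	#define EXT4_RESIZING	0

	static int ext4_resize_begin(struct super_block *sb)
	{
		/* One resizer at a time; a second caller simply gets -EBUSY. */
		if (test_and_set_bit_lock(EXT4_RESIZING,
					  &EXT4_SB(sb)->s_resize_flags))
			return -EBUSY;
		return 0;
	}

	static void ext4_resize_end(struct super_block *sb)
	{
		clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
		smp_mb__after_clear_bit();
	}

An exclusion scheme along these lines would also explain why the in-function "multiple resizers" checks and their -EBUSY returns could be dropped from ext4_group_add() and ext4_group_extend() earlier in this patch.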
@@ -3741,12 +3773,8 @@ failed_mount_wq: | |||
3741 | } | 3773 | } |
3742 | failed_mount3: | 3774 | failed_mount3: |
3743 | del_timer(&sbi->s_err_report); | 3775 | del_timer(&sbi->s_err_report); |
3744 | if (sbi->s_flex_groups) { | 3776 | if (sbi->s_flex_groups) |
3745 | if (is_vmalloc_addr(sbi->s_flex_groups)) | 3777 | ext4_kvfree(sbi->s_flex_groups); |
3746 | vfree(sbi->s_flex_groups); | ||
3747 | else | ||
3748 | kfree(sbi->s_flex_groups); | ||
3749 | } | ||
3750 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 3778 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
3751 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 3779 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
3752 | percpu_counter_destroy(&sbi->s_dirs_counter); | 3780 | percpu_counter_destroy(&sbi->s_dirs_counter); |
@@ -3756,7 +3784,7 @@ failed_mount3: | |||
3756 | failed_mount2: | 3784 | failed_mount2: |
3757 | for (i = 0; i < db_count; i++) | 3785 | for (i = 0; i < db_count; i++) |
3758 | brelse(sbi->s_group_desc[i]); | 3786 | brelse(sbi->s_group_desc[i]); |
3759 | kfree(sbi->s_group_desc); | 3787 | ext4_kvfree(sbi->s_group_desc); |
3760 | failed_mount: | 3788 | failed_mount: |
3761 | if (sbi->s_proc) { | 3789 | if (sbi->s_proc) { |
3762 | remove_proc_entry(sb->s_id, ext4_proc_root); | 3790 | remove_proc_entry(sb->s_id, ext4_proc_root); |
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 000000000000..011ba6670d99 --- /dev/null +++ b/fs/ext4/truncate.h | |||
@@ -0,0 +1,43 @@ | |||
1 | /* | ||
2 | * linux/fs/ext4/truncate.h | ||
3 | * | ||
4 | * Common inline functions needed for truncate support | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * Truncate blocks that were not used by write. We have to truncate the | ||
9 | * pagecache as well so that corresponding buffers get properly unmapped. | ||
10 | */ | ||
11 | static inline void ext4_truncate_failed_write(struct inode *inode) | ||
12 | { | ||
13 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
14 | ext4_truncate(inode); | ||
15 | } | ||
16 | |||
17 | /* | ||
18 | * Work out how many blocks we need to proceed with the next chunk of a | ||
19 | * truncate transaction. | ||
20 | */ | ||
21 | static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) | ||
22 | { | ||
23 | ext4_lblk_t needed; | ||
24 | |||
25 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | ||
26 | |||
27 | /* Give ourselves just enough room to cope with inodes in which | ||
28 | * i_blocks is corrupt: we've seen disk corruptions in the past | ||
29 | * which resulted in random data in an inode which looked enough | ||
30 | * like a regular file for ext4 to try to delete it. Things | ||
31 | * will go a bit crazy if that happens, but at least we should | ||
32 | * try not to panic the whole kernel. */ | ||
33 | if (needed < 2) | ||
34 | needed = 2; | ||
35 | |||
36 | /* But we need to bound the transaction so we don't overflow the | ||
37 | * journal. */ | ||
38 | if (needed > EXT4_MAX_TRANS_DATA) | ||
39 | needed = EXT4_MAX_TRANS_DATA; | ||
40 | |||
41 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; | ||
42 | } | ||
43 | |||
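As a worked example of ext4_blocks_for_truncate(): i_blocks counts 512-byte sectors, so the shift by (s_blocksize_bits - 9) converts sectors to filesystem blocks. Assuming a 4 KiB block size (s_blocksize_bits = 12) and EXT4_MAX_TRANS_DATA of 64, its value in ext4_jbd2.h at the time:

	/* An inode with i_blocks == 800 sectors on a 4 KiB-block filesystem: */
	needed = 800 >> (12 - 9);		/* 800 sectors -> 100 fs blocks */
	if (needed > EXT4_MAX_TRANS_DATA)	/* 100 > 64 */
		needed = EXT4_MAX_TRANS_DATA;	/* clamp to 64 */
	/* credits reserved: EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 64 */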