author		David S. Miller <davem@davemloft.net>	2011-08-08 02:20:26 -0400
committer	David S. Miller <davem@davemloft.net>	2011-08-08 02:20:26 -0400
commit		19fd61785a580c60cba900c5171bfadb57dd5056 (patch)
tree		1e491fb014be0dc03f4b6755bb94e73afd38c455 /fs
parent		57569d0e12eaf31717e295960cd2a26f626c8e5b (diff)
parent		8028837d71ba9904b17281b40f94b93e947fbe38 (diff)

Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net

Diffstat (limited to 'fs')
 90 files changed, 3509 insertions(+), 3549 deletions(-)
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index e9cb57f07546..9a1d42630751 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -182,11 +182,11 @@ int v9fs_set_create_acl(struct dentry *dentry,
 	return 0;
 }
 
-int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 		  struct posix_acl **dpacl, struct posix_acl **pacl)
 {
 	int retval = 0;
-	mode_t mode = *modep;
+	umode_t mode = *modep;
 	struct posix_acl *acl = NULL;
 
 	if (!S_ISLNK(mode)) {
@@ -319,7 +319,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			mode_t mode = inode->i_mode;
+			umode_t mode = inode->i_mode;
 			retval = posix_acl_equiv_mode(acl, &mode);
 			if (retval < 0)
 				goto err_out;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index ddb7ae19d971..559556411965 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -20,7 +20,7 @@ extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
 			       struct posix_acl **, struct posix_acl **);
-extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 			 struct posix_acl **dpacl, struct posix_acl **pacl);
 #else
 #define v9fs_iop_get_acl NULL
@@ -38,7 +38,7 @@ static inline int v9fs_set_create_acl(struct dentry *dentry,
 {
 	return 0;
 }
-static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 				struct posix_acl **dpacl,
 				struct posix_acl **pacl)
 {
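
The 9p hunks above (and several btrfs hunks below) are part of a tree-wide sweep replacing mode_t with umode_t in VFS-internal code. A hedged illustration of why the two types are not interchangeable, not taken from this commit:

/* Hedged sketch, not from this commit: in kernels of this era the
 * userspace-facing mode_t is a full unsigned int, while the VFS-internal
 * umode_t is an unsigned short -- the same width as inode->i_mode. */
typedef unsigned short umode_t;

/* Because the widths differ, passing &inode->i_mode to a helper declared
 * with a mode_t * parameter is a type mismatch; once the helpers take
 * umode_t *, callers can hand over &inode->i_mode directly, as the btrfs
 * ACL hunks below do. */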
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 9a26dce5a99f..b6c8ed205192 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -206,7 +206,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err = 0;
 	gid_t gid;
 	int flags;
-	mode_t mode;
+	umode_t mode;
 	char *name = NULL;
 	struct file *filp;
 	struct p9_qid qid;
@@ -348,7 +348,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct inode *inode;
 	struct p9_qid qid;
 	struct dentry *dir_dentry;
@@ -751,7 +751,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
diff --git a/fs/Kconfig b/fs/Kconfig
index 19891aab9c6e..9fe0b349f4cd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL
 	select TMPFS_XATTR
 	select GENERIC_ACL
 	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
+	  POSIX Access Control Lists (ACLs) support additional access rights
+	  for users and groups beyond the standard owner/group/world scheme,
+	  and this option selects support for ACLs specifically for tmpfs
+	  filesystems.
+
+	  If you've selected TMPFS, it's possible that you'll also need
+	  this option as there are a number of Linux distros that require
+	  POSIX ACL support under /dev for certain features to work properly.
+	  For example, some distros need this feature for ALSA-related /dev
+	  files for sound to work properly.  In short, if you're not sure,
+	  say Y.
 
 	  To learn more about Access Control Lists, visit the POSIX ACLs for
 	  Linux website <http://acl.bestbits.at/>.
 
-	  If you don't know what Access Control Lists are, say N.
-
 config TMPFS_XATTR
 	bool "Tmpfs extended attributes"
 	depends on TMPFS
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f55aad4d1611..ff77262e887c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -387,6 +387,10 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	struct inode *bd_inode = filp->f_mapping->host;
 	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
+
+	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+	if (error)
+		return error;
 
 	/*
 	 * There is no need to serialise calls to blkdev_issue_flush with
@@ -552,6 +556,7 @@ struct block_device *bdget(dev_t dev)
 
 	if (inode->i_state & I_NEW) {
 		bdev->bd_contains = NULL;
+		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
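
The first block_dev.c hunk tightens the durability contract of blkdev_fsync(): dirty pagecache in the requested range is now written back and waited on before the disk cache flush is issued. A hedged sketch of the resulting ordering, assuming the 3.x-era blkdev_issue_flush() signature:

/* Hedged sketch of the ordering established above, not the literal body:
 * flush data pages first, then the device's volatile write cache. */
int sketch_blkdev_fsync(struct file *filp, loff_t start, loff_t end)
{
	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
	int error;

	/* step 1: push dirty pagecache in [start, end] and wait for it */
	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
	if (error)
		return error;

	/* step 2: ask the device to empty its write cache onto stable media */
	return blkdev_issue_flush(bdev, GFP_KERNEL, NULL);
}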
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9b72dcf1cd25..40e6ac08c21f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 65a735d8f6e4..eb159aaa5a11 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -28,8 +28,6 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
 	int size;
@@ -111,7 +109,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 	int ret, size = 0;
 	const char *name;
 	char *value = NULL;
-	mode_t mode;
 
 	if (acl) {
 		ret = posix_acl_valid(acl);
@@ -122,13 +119,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		mode = inode->i_mode;
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &mode);
+			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
 				return ret;
-			inode->i_mode = mode;
 		}
 		ret = 0;
 		break;
@@ -222,19 +217,16 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
-		mode_t mode = inode->i_mode;
-
 		if (S_ISDIR(inode->i_mode)) {
 			ret = btrfs_set_acl(trans, inode, acl,
 					    ACL_TYPE_DEFAULT);
 			if (ret)
 				goto failed;
 		}
-		ret = posix_acl_create(&acl, GFP_NOFS, &mode);
+		ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
 		if (ret < 0)
 			return ret;
 
-		inode->i_mode = mode;
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
@@ -282,18 +274,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.get = btrfs_xattr_acl_get,
 	.set = btrfs_xattr_acl_set,
 };
-
-#else /* CONFIG_BTRFS_FS_POSIX_ACL */
-
-int btrfs_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
-		   struct inode *inode, struct inode *dir)
-{
-	return 0;
-}
-
-#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfe42b03eaf9..8ec5d86f1734 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -338,6 +338,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	u64 first_byte = disk_start;
 	struct block_device *bdev;
 	int ret;
+	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -392,8 +393,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 		BUG_ON(ret);
 
-		ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-		BUG_ON(ret);
+		if (!skip_sum) {
+			ret = btrfs_csum_one_bio(root, inode, bio,
+						 start, 1);
+			BUG_ON(ret);
+		}
 
 		ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 		BUG_ON(ret);
@@ -418,8 +422,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-	BUG_ON(ret);
+	if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 	BUG_ON(ret);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 365c4e1dde04..0469263e327e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2406,8 +2406,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 		      btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
-int btrfs_set_root_node(struct btrfs_root_item *item,
-			struct extent_buffer *node);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+			 struct extent_buffer *node);
 void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
 
 /* dir-item.c */
@@ -2523,6 +2523,14 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 #define PageChecked PageFsMisc
 #endif
 
+/* This forces readahead on a given range of bytes in an inode */
+static inline void btrfs_force_ra(struct address_space *mapping,
+				  struct file_ra_state *ra, struct file *file,
+				  pgoff_t offset, unsigned long req_size)
+{
+	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+}
+
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -2551,9 +2559,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
 
-unsigned long btrfs_force_ra(struct address_space *mapping,
-			      struct file_ra_state *ra, struct file *file,
-			      pgoff_t offset, pgoff_t last_index);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -2648,12 +2653,21 @@ do { \
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
-#else
-#define btrfs_get_acl NULL
-#endif
 int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		   struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
+#else
+#define btrfs_get_acl NULL
+static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
+				 struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+static inline int btrfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+#endif
 
 /* relocation.c */
 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
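
Taken together, the Makefile, acl.c, and ctree.h hunks adopt the usual kernel pattern for optional features: compile the real implementation only when the config option is set, and supply static inline stubs in the header otherwise. A hedged, generic sketch of the pattern (the EXAMPLE_* names are illustrative, not from the kernel):

#ifdef CONFIG_EXAMPLE_FEATURE
/* real implementation lives in feature.o, which kbuild only links when
 * the option is enabled:  mod-$(CONFIG_EXAMPLE_FEATURE) += feature.o   */
int example_feature_init(struct inode *inode);
#else
/* option disabled: callers still compile, and the compiler can discard
 * the call to this constant no-op entirely. */
static inline int example_feature_init(struct inode *inode)
{
	return 0;
}
#endif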
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c360a848d97f..31d84e78129b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -198,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
 
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -209,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
 		return ERR_PTR(ret);
-	if (ret > 0) {
-		if (path->slots[0] == 0)
-			return NULL;
-		path->slots[0]--;
-	}
-
-	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-	if (found_key.objectid != dir ||
-	    btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
-	    found_key.offset != key.offset)
+	if (ret > 0)
 		return NULL;
 
 	return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -315,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
 
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
@@ -324,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
 		return ERR_PTR(ret);
-	if (ret > 0) {
-		if (path->slots[0] == 0)
-			return NULL;
-		path->slots[0]--;
-	}
-
-	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-	if (found_key.objectid != dir ||
-	    btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
-	    found_key.offset != key.offset)
+	if (ret > 0)
 		return NULL;
 
 	return btrfs_match_dir_item_name(root, path, name, name_len);
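
A hedged reading of the two dir-item.c simplifications above: btrfs_search_slot() returns 0 only when the exact key is present and a positive value when it is not, so for an exact-key lookup the deleted back-up-one-slot-and-recompare dance could never produce a match. Both lookups now reduce to:

/* Hedged sketch of the simplified control flow above (assumption about
 * btrfs_search_slot() semantics: 0 = exact key found, > 0 = not found,
 * < 0 = error). */
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
	return ERR_PTR(ret);	/* real I/O or tree error */
if (ret > 0)
	return NULL;		/* exact key absent: no such dir/xattr item */
/* key exists: match the name within the item */
return btrfs_match_dir_item_name(root, path, name, name_len);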
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4d08ed79405d..66bac226944e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -663,7 +663,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 	struct btrfs_path *path;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	key.objectid = start;
 	key.offset = len;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -3272,6 +3274,9 @@ again:
 	}
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
+	if (ret < 0 && ret != -ENOSPC)
+		goto out;
+
 	spin_lock(&space_info->lock);
 	if (ret)
 		space_info->full = 1;
@@ -3281,6 +3286,7 @@ again:
 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	space_info->chunk_alloc = 0;
 	spin_unlock(&space_info->lock);
+out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
 }
@@ -4456,7 +4462,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			printk(KERN_ERR "umm, got %d back from search"
 			       ", was looking for %llu\n", ret,
 			       (unsigned long long)bytenr);
-			btrfs_print_leaf(extent_root, path->nodes[0]);
+			if (ret > 0)
+				btrfs_print_leaf(extent_root,
+						 path->nodes[0]);
 		}
 		BUG_ON(ret);
 		extent_slot = path->slots[0];
@@ -5073,7 +5081,9 @@ have_block_group:
 			 * group is does point to and try again
 			 */
 			if (!last_ptr_loop && last_ptr->block_group &&
-			    last_ptr->block_group != block_group) {
+			    last_ptr->block_group != block_group &&
+			    index <=
+				get_block_group_index(last_ptr->block_group)) {
 
 				btrfs_put_block_group(block_group);
 				block_group = last_ptr->block_group;
@@ -5501,7 +5511,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -6272,10 +6283,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	int level;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
-	BUG_ON(!wc);
+	if (!wc) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
 
 	trans = btrfs_start_transaction(tree_root, 0);
 	BUG_ON(IS_ERR(trans));
@@ -6538,8 +6553,6 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	u64 min_allocable_bytes;
 	int ret = -ENOSPC;
 
-	if (cache->ro)
-		return 0;
 
 	/*
 	 * We need some metadata space and system metadata space for
@@ -6555,6 +6568,12 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 
 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);
+
+	if (cache->ro) {
+		ret = 0;
+		goto out;
+	}
+
 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 
@@ -6568,7 +6587,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 		cache->ro = 1;
 		ret = 0;
 	}
-
+out:
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 	return ret;
@@ -7183,11 +7202,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&cluster->refill_lock);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	inode = lookup_free_space_inode(root, block_group, path);
 	if (!IS_ERR(inode)) {
-		btrfs_orphan_add(trans, inode);
+		ret = btrfs_orphan_add(trans, inode);
+		BUG_ON(ret);
 		clear_nlink(inode);
 		/* One for the block groups ref */
 		spin_lock(&block_group->lock);
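
The recurring change across the extent-tree.c hunks (and in file-item.c, file.c, and inode.c below) is replacing BUG_ON(!ptr) after allocations with proper -ENOMEM returns. A hedged distillation of the pattern, consolidated from the hunks above:

/* Hedged sketch of the error-handling pattern adopted above: allocation
 * failure now unwinds and reports -ENOMEM instead of crashing the kernel. */
path = btrfs_alloc_path();
if (!path)
	return -ENOMEM;

wc = kzalloc(sizeof(*wc), GFP_NOFS);
if (!wc) {
	btrfs_free_path(path);	/* release the earlier allocation first */
	return -ENOMEM;
}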
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 067b1747421b..d418164a35f1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -254,14 +254,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
  *
  * This should be called with the tree lock held.
  */
-static int merge_state(struct extent_io_tree *tree,
-		       struct extent_state *state)
+static void merge_state(struct extent_io_tree *tree,
+			struct extent_state *state)
 {
 	struct extent_state *other;
 	struct rb_node *other_node;
 
 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
-		return 0;
+		return;
 
 	other_node = rb_prev(&state->rb_node);
 	if (other_node) {
@@ -287,19 +287,13 @@ static int merge_state(struct extent_io_tree *tree,
 			free_extent_state(other);
 		}
 	}
-
-	return 0;
 }
 
-static int set_state_cb(struct extent_io_tree *tree,
-			struct extent_state *state, int *bits)
+static void set_state_cb(struct extent_io_tree *tree,
+			 struct extent_state *state, int *bits)
 {
-	if (tree->ops && tree->ops->set_bit_hook) {
-		return tree->ops->set_bit_hook(tree->mapping->host,
-					       state, bits);
-	}
-
-	return 0;
+	if (tree->ops && tree->ops->set_bit_hook)
+		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
@@ -309,6 +303,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state, int *bits);
+
 /*
  * insert an extent_state struct into the tree. 'bits' are set on the
  * struct before it is inserted.
@@ -324,8 +321,6 @@ static int insert_state(struct extent_io_tree *tree,
 			int *bits)
 {
 	struct rb_node *node;
-	int bits_to_set = *bits & ~EXTENT_CTLBITS;
-	int ret;
 
 	if (end < start) {
 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -335,13 +330,9 @@ static int insert_state(struct extent_io_tree *tree,
 	}
 	state->start = start;
 	state->end = end;
-	ret = set_state_cb(tree, state, bits);
-	if (ret)
-		return ret;
 
-	if (bits_to_set & EXTENT_DIRTY)
-		tree->dirty_bytes += end - start + 1;
-	state->state |= bits_to_set;
+	set_state_bits(tree, state, bits);
+
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -357,13 +348,11 @@
 	return 0;
 }
 
-static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
-		    u64 split)
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+		     u64 split)
 {
 	if (tree->ops && tree->ops->split_extent_hook)
-		return tree->ops->split_extent_hook(tree->mapping->host,
-						    orig, split);
-	return 0;
+		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 }
 
 /*
@@ -659,34 +648,25 @@ again:
 		if (start > end)
 			break;
 
-		if (need_resched()) {
-			spin_unlock(&tree->lock);
-			cond_resched();
-			spin_lock(&tree->lock);
-		}
+		cond_resched_lock(&tree->lock);
 	}
 out:
 	spin_unlock(&tree->lock);
 	return 0;
 }
 
-static int set_state_bits(struct extent_io_tree *tree,
-			  struct extent_state *state,
-			  int *bits)
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state,
+			   int *bits)
 {
-	int ret;
 	int bits_to_set = *bits & ~EXTENT_CTLBITS;
 
-	ret = set_state_cb(tree, state, bits);
-	if (ret)
-		return ret;
+	set_state_cb(tree, state, bits);
 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
 	}
 	state->state |= bits_to_set;
-
-	return 0;
 }
 
 static void cache_state(struct extent_state *state,
@@ -779,9 +759,7 @@ hit_next:
 			goto out;
 	}
 
-	err = set_state_bits(tree, state, &bits);
-	if (err)
-		goto out;
+	set_state_bits(tree, state, &bits);
 
 	cache_state(state, cached_state);
 	merge_state(tree, state);
@@ -830,9 +808,7 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			err = set_state_bits(tree, state, &bits);
-			if (err)
-				goto out;
+			set_state_bits(tree, state, &bits);
 			cache_state(state, cached_state);
 			merge_state(tree, state);
 			if (last_end == (u64)-1)
@@ -893,11 +869,7 @@ hit_next:
 		err = split_state(tree, state, prealloc, end + 1);
 		BUG_ON(err == -EEXIST);
 
-		err = set_state_bits(tree, prealloc, &bits);
-		if (err) {
-			prealloc = NULL;
-			goto out;
-		}
+		set_state_bits(tree, prealloc, &bits);
 		cache_state(prealloc, cached_state);
 		merge_state(tree, prealloc);
 		prealloc = NULL;
@@ -1059,46 +1031,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	return 0;
 }
 
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 1;
-
-	spin_lock(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(tree, start);
-	if (!node)
-		goto out;
-
-	while (1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
-			*start_ret = state->start;
-			*end_ret = state->end;
-			ret = 0;
-			break;
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	spin_unlock(&tree->lock);
-	return ret;
-}
-
 /* find the first state struct with 'bits' set after 'start', and
  * return it. tree->lock must be held. NULL will returned if
  * nothing was found after 'start'
@@ -1131,6 +1063,30 @@ out:
 }
 
 /*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits)
+{
+	struct extent_state *state;
+	int ret = 1;
+
+	spin_lock(&tree->lock);
+	state = find_first_extent_bit_state(tree, start, bits);
+	if (state) {
+		*start_ret = state->start;
+		*end_ret = state->end;
+		ret = 0;
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+/*
  * find a contiguous range of bytes in the file marked as delalloc, not
  * more than 'max_bytes'. start and end are used to return the range,
 *
@@ -2546,7 +2502,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  struct writeback_control *wbc)
 {
 	int ret;
-	struct address_space *mapping = page->mapping;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
@@ -2554,17 +2509,9 @@
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
-	struct writeback_control wbc_writepages = {
-		.sync_mode = wbc->sync_mode,
-		.nr_to_write = 64,
-		.range_start = page_offset(page) + PAGE_CACHE_SIZE,
-		.range_end = (loff_t)-1,
-	};
 
 	ret = __extent_writepage(page, wbc, &epd);
 
-	extent_write_cache_pages(tree, mapping, &wbc_writepages,
-				 __extent_writepage, &epd, flush_write_bio);
 	flush_epd_write_bio(&epd);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 21a7ca9e7282..7b2f0c3e7929 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -76,15 +76,15 @@ struct extent_io_ops {
 			      struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				     struct extent_state *state, int uptodate);
-	int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
-			    int *bits);
-	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-			      int *bits);
-	int (*merge_extent_hook)(struct inode *inode,
-				 struct extent_state *new,
-				 struct extent_state *other);
-	int (*split_extent_hook)(struct inode *inode,
-				 struct extent_state *orig, u64 split);
+	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+			     int *bits);
+	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+			       int *bits);
+	void (*merge_extent_hook)(struct inode *inode,
+				  struct extent_state *new,
+				  struct extent_state *other);
+	void (*split_extent_hook)(struct inode *inode,
+				  struct extent_state *orig, u64 split);
 	int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
@@ -108,8 +108,6 @@ struct extent_state {
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
-	u64 split_start;
-	u64 split_end;
 
 	/* for use by the FS */
 	u64 private;
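
The extent_io.c and extent_io.h hunks convert hooks that could only ever return 0 into void functions. A hedged before/after of what that buys at a call site, lifted from the pattern of the hunks above:

/* before: a dead error path the compiler and reader must still consider */
err = set_state_bits(tree, state, &bits);
if (err)
	goto out;	/* unreachable: set_state_bits() always returned 0 */

/* after: the impossible failure mode is gone from the signature */
set_state_bits(tree, state, &bits);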
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2d0410344ea3..7c97b3301459 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	return 0;
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 {
-	int ret = 0;
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
-	struct extent_map *em;
-
-	write_lock(&tree->lock);
-	em = lookup_extent_mapping(tree, start, len);
-
-	WARN_ON(!em || em->start != start);
-
-	if (!em)
-		goto out;
-
-	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
@@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
 		merge->in_tree = 0;
 		free_extent_map(merge);
 	}
+}
+
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+	int ret = 0;
+	struct extent_map *em;
+
+	write_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, start, len);
+
+	WARN_ON(!em || em->start != start);
+
+	if (!em)
+		goto out;
+
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	try_merge_map(tree, em);
 
 	free_extent_map(em);
 out:
@@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 			struct extent_map *em)
 {
 	int ret = 0;
-	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 	struct extent_map *exist;
 
@@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	atomic_inc(&em->refs);
-	if (em->start != 0) {
-		rb = rb_prev(&em->rb_node);
-		if (rb)
-			merge = rb_entry(rb, struct extent_map, rb_node);
-		if (rb && mergable_maps(merge, em)) {
-			em->start = merge->start;
-			em->len += merge->len;
-			em->block_len += merge->block_len;
-			em->block_start = merge->block_start;
-			merge->in_tree = 0;
-			rb_erase(&merge->rb_node, &tree->map);
-			free_extent_map(merge);
-		}
-	}
-	rb = rb_next(&em->rb_node);
-	if (rb)
-		merge = rb_entry(rb, struct extent_map, rb_node);
-	if (rb && mergable_maps(em, merge)) {
-		em->len += merge->len;
-		em->block_len += merge->len;
-		rb_erase(&merge->rb_node, &tree->map);
-		merge->in_tree = 0;
-		free_extent_map(merge);
-	}
+
+	try_merge_map(tree, em);
 out:
 	return ret;
 }
@@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len)
 	return start + len;
 }
 
-/**
- * lookup_extent_mapping - lookup extent_map
- * @tree:	tree to lookup in
- * @start:	byte offset to start the search
- * @len:	length of the lookup range
- *
- * Find and return the first extent_map struct in @tree that intersects the
- * [start, len] range.  There may be additional objects in the tree that
- * intersect, so check the object returned carefully to make sure that no
- * additional lookups are needed.
- */
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 len)
+struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
+					   u64 start, u64 len, int strict)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
@@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	u64 end = range_end(start, len);
 
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
 	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
+		if (prev)
+			rb_node = prev;
+		else if (next)
+			rb_node = next;
+		else
+			return NULL;
 	}
+
 	em = rb_entry(rb_node, struct extent_map, rb_node);
-	if (end > em->start && start < extent_map_end(em))
-		goto found;
 
-	em = NULL;
-	goto out;
+	if (strict && !(end > em->start && start < extent_map_end(em)))
+		return NULL;
 
-found:
 	atomic_inc(&em->refs);
-out:
 	return em;
 }
 
 /**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.  There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len)
+{
+	return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
  * search_extent_mapping - find a nearby extent map
  * @tree:	tree to lookup in
  * @start:	byte offset to start the search
@@ -365,38 +341,7 @@ out:
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len)
 {
-	struct extent_map *em;
-	struct rb_node *rb_node;
-	struct rb_node *prev = NULL;
-	struct rb_node *next = NULL;
-
-	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
-	}
-	em = rb_entry(rb_node, struct extent_map, rb_node);
-	goto found;
-
-	em = NULL;
-	goto out;
-
-found:
-	atomic_inc(&em->refs);
-out:
-	return em;
+	return __lookup_extent_mapping(tree, start, len, 0);
 }
 
 /**
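
After the extent_map.c refactor above, both public lookups are thin wrappers over __lookup_extent_mapping(), differing only in the strict flag. A hedged usage note, not part of the commit:

/* Hedged sketch: lookup_extent_mapping() (strict == 1) returns NULL unless
 * the cached map overlaps [start, start + len), while search_extent_mapping()
 * (strict == 0) may return a nearby, non-overlapping neighbor, so its
 * callers must re-check em->start and extent_map_end(em) themselves. */
struct extent_map *em;

em = lookup_extent_mapping(tree, start, len);
if (em) {
	/* ... use the overlapping mapping ... */
	free_extent_map(em);	/* both lookups take a reference */
}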
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 08bcfa92a222..b910694f61ed 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -291,7 +291,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	if (search_commit) {
 		path->skip_locking = 1;
@@ -677,7 +678,9 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 		btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	sector_sum = sums->sums;
 again:
 	next_offset = (u64)-1;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a35e51c9f235..658d66959abe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -74,7 +74,7 @@ struct inode_defrag {
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static int __btrfs_add_inode_defrag(struct inode *inode,
-				    struct inode_defrag *defrag)
+static void __btrfs_add_inode_defrag(struct inode *inode,
+				     struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
 	BTRFS_I(inode)->in_defrag = 1;
 	rb_link_node(&defrag->rb_node, parent, p);
 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-	return 0;
+	return;
 
 exists:
 	kfree(defrag);
-	return 0;
+	return;
 
 }
 
@@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *defrag;
-	int ret = 0;
 	u64 transid;
 
 	if (!btrfs_test_opt(root, AUTO_DEFRAG))
@@ -150,9 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 
 	spin_lock(&root->fs_info->defrag_inodes_lock);
 	if (!BTRFS_I(inode)->in_defrag)
-		ret = __btrfs_add_inode_defrag(inode, defrag);
+		__btrfs_add_inode_defrag(inode, defrag);
 	spin_unlock(&root->fs_info->defrag_inodes_lock);
-	return ret;
+	return 0;
 }
 
 /*
@@ -855,7 +854,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 again:
 	recow = 0;
 	split = start;
@@ -1059,7 +1059,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
 static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
-			 unsigned long last_index, size_t write_bytes)
+			 size_t write_bytes)
 {
 	struct extent_state *cached_state = NULL;
 	int i;
@@ -1159,7 +1159,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
 	unsigned long first_index;
-	unsigned long last_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
@@ -1172,7 +1171,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		return -ENOMEM;
 
 	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
 
 	while (iov_iter_count(i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
@@ -1206,8 +1204,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * contents of pages from loop to loop
 		 */
 		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, last_index,
-				    write_bytes);
+				    pos, first_index, write_bytes);
 		if (ret) {
 			btrfs_delalloc_release_space(inode,
 					num_pages << PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 13e6255182e3..15fceefbca0a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -1061,7 +1061,8 @@ static noinline int run_delalloc_nocow(struct inode *inode, | |||
1061 | u64 ino = btrfs_ino(inode); | 1061 | u64 ino = btrfs_ino(inode); |
1062 | 1062 | ||
1063 | path = btrfs_alloc_path(); | 1063 | path = btrfs_alloc_path(); |
1064 | BUG_ON(!path); | 1064 | if (!path) |
1065 | return -ENOMEM; | ||
1065 | 1066 | ||
1066 | nolock = btrfs_is_free_space_inode(root, inode); | 1067 | nolock = btrfs_is_free_space_inode(root, inode); |
1067 | 1068 | ||
@@ -1282,17 +1283,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, | |||
1282 | return ret; | 1283 | return ret; |
1283 | } | 1284 | } |
1284 | 1285 | ||
1285 | static int btrfs_split_extent_hook(struct inode *inode, | 1286 | static void btrfs_split_extent_hook(struct inode *inode, |
1286 | struct extent_state *orig, u64 split) | 1287 | struct extent_state *orig, u64 split) |
1287 | { | 1288 | { |
1288 | /* not delalloc, ignore it */ | 1289 | /* not delalloc, ignore it */ |
1289 | if (!(orig->state & EXTENT_DELALLOC)) | 1290 | if (!(orig->state & EXTENT_DELALLOC)) |
1290 | return 0; | 1291 | return; |
1291 | 1292 | ||
1292 | spin_lock(&BTRFS_I(inode)->lock); | 1293 | spin_lock(&BTRFS_I(inode)->lock); |
1293 | BTRFS_I(inode)->outstanding_extents++; | 1294 | BTRFS_I(inode)->outstanding_extents++; |
1294 | spin_unlock(&BTRFS_I(inode)->lock); | 1295 | spin_unlock(&BTRFS_I(inode)->lock); |
1295 | return 0; | ||
1296 | } | 1296 | } |
1297 | 1297 | ||
1298 | /* | 1298 | /* |
@@ -1301,18 +1301,17 @@ static int btrfs_split_extent_hook(struct inode *inode, | |||
1301 | * extents, such as when we are doing sequential writes, so we can properly | 1301 | * extents, such as when we are doing sequential writes, so we can properly |
1302 | * account for the metadata space we'll need. | 1302 | * account for the metadata space we'll need. |
1303 | */ | 1303 | */ |
1304 | static int btrfs_merge_extent_hook(struct inode *inode, | 1304 | static void btrfs_merge_extent_hook(struct inode *inode, |
1305 | struct extent_state *new, | 1305 | struct extent_state *new, |
1306 | struct extent_state *other) | 1306 | struct extent_state *other) |
1307 | { | 1307 | { |
1308 | /* not delalloc, ignore it */ | 1308 | /* not delalloc, ignore it */ |
1309 | if (!(other->state & EXTENT_DELALLOC)) | 1309 | if (!(other->state & EXTENT_DELALLOC)) |
1310 | return 0; | 1310 | return; |
1311 | 1311 | ||
1312 | spin_lock(&BTRFS_I(inode)->lock); | 1312 | spin_lock(&BTRFS_I(inode)->lock); |
1313 | BTRFS_I(inode)->outstanding_extents--; | 1313 | BTRFS_I(inode)->outstanding_extents--; |
1314 | spin_unlock(&BTRFS_I(inode)->lock); | 1314 | spin_unlock(&BTRFS_I(inode)->lock); |
1315 | return 0; | ||
1316 | } | 1315 | } |
1317 | 1316 | ||
1318 | /* | 1317 | /* |
@@ -1320,8 +1319,8 @@ static int btrfs_merge_extent_hook(struct inode *inode, | |||
1320 | * bytes in this file, and to maintain the list of inodes that | 1319 | * bytes in this file, and to maintain the list of inodes that |
1321 | * have pending delalloc work to be done. | 1320 | * have pending delalloc work to be done. |
1322 | */ | 1321 | */ |
1323 | static int btrfs_set_bit_hook(struct inode *inode, | 1322 | static void btrfs_set_bit_hook(struct inode *inode, |
1324 | struct extent_state *state, int *bits) | 1323 | struct extent_state *state, int *bits) |
1325 | { | 1324 | { |
1326 | 1325 | ||
1327 | /* | 1326 | /* |
@@ -1351,14 +1350,13 @@ static int btrfs_set_bit_hook(struct inode *inode, | |||
1351 | } | 1350 | } |
1352 | spin_unlock(&root->fs_info->delalloc_lock); | 1351 | spin_unlock(&root->fs_info->delalloc_lock); |
1353 | } | 1352 | } |
1354 | return 0; | ||
1355 | } | 1353 | } |
1356 | 1354 | ||
1357 | /* | 1355 | /* |
1358 | * extent_io.c clear_bit_hook, see set_bit_hook for why | 1356 | * extent_io.c clear_bit_hook, see set_bit_hook for why |
1359 | */ | 1357 | */ |
1360 | static int btrfs_clear_bit_hook(struct inode *inode, | 1358 | static void btrfs_clear_bit_hook(struct inode *inode, |
1361 | struct extent_state *state, int *bits) | 1359 | struct extent_state *state, int *bits) |
1362 | { | 1360 | { |
1363 | /* | 1361 | /* |
1364 | * set_bit and clear bit hooks normally require _irqsave/restore | 1362 | * set_bit and clear bit hooks normally require _irqsave/restore |
@@ -1395,7 +1393,6 @@ static int btrfs_clear_bit_hook(struct inode *inode, | |||
1395 | } | 1393 | } |
1396 | spin_unlock(&root->fs_info->delalloc_lock); | 1394 | spin_unlock(&root->fs_info->delalloc_lock); |
1397 | } | 1395 | } |
1398 | return 0; | ||
1399 | } | 1396 | } |
1400 | 1397 | ||
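These extent-state hooks never fail: every exit path returned 0, so the int return type conveyed nothing and callers had no error to act on. The resulting signature, condensed from the hunks above (illustrative name):

	static void example_split_hook(struct inode *inode,
				       struct extent_state *orig, u64 split)
	{
		if (!(orig->state & EXTENT_DELALLOC))
			return;		/* not delalloc, nothing to account */

		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents++;
		spin_unlock(&BTRFS_I(inode)->lock);
	}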
1401 | /* | 1398 | /* |
@@ -1645,7 +1642,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
1645 | int ret; | 1642 | int ret; |
1646 | 1643 | ||
1647 | path = btrfs_alloc_path(); | 1644 | path = btrfs_alloc_path(); |
1648 | BUG_ON(!path); | 1645 | if (!path) |
1646 | return -ENOMEM; | ||
1649 | 1647 | ||
1650 | path->leave_spinning = 1; | 1648 | path->leave_spinning = 1; |
1651 | 1649 | ||
@@ -2215,7 +2213,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) | |||
2215 | 2213 | ||
2216 | if (!root->orphan_block_rsv) { | 2214 | if (!root->orphan_block_rsv) { |
2217 | block_rsv = btrfs_alloc_block_rsv(root); | 2215 | block_rsv = btrfs_alloc_block_rsv(root); |
2218 | BUG_ON(!block_rsv); | 2216 | if (!block_rsv) |
2217 | return -ENOMEM; | ||
2219 | } | 2218 | } |
2220 | 2219 | ||
2221 | spin_lock(&root->orphan_lock); | 2220 | spin_lock(&root->orphan_lock); |
@@ -2517,7 +2516,9 @@ static void btrfs_read_locked_inode(struct inode *inode) | |||
2517 | filled = true; | 2516 | filled = true; |
2518 | 2517 | ||
2519 | path = btrfs_alloc_path(); | 2518 | path = btrfs_alloc_path(); |
2520 | BUG_ON(!path); | 2519 | if (!path) |
2520 | goto make_bad; | ||
2521 | |||
2521 | path->leave_spinning = 1; | 2522 | path->leave_spinning = 1; |
2522 | memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); | 2523 | memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); |
2523 | 2524 | ||
@@ -2998,13 +2999,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) | |||
2998 | 2999 | ||
2999 | ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, | 3000 | ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, |
3000 | dentry->d_name.name, dentry->d_name.len); | 3001 | dentry->d_name.name, dentry->d_name.len); |
3001 | BUG_ON(ret); | 3002 | if (ret) |
3003 | goto out; | ||
3002 | 3004 | ||
3003 | if (inode->i_nlink == 0) { | 3005 | if (inode->i_nlink == 0) { |
3004 | ret = btrfs_orphan_add(trans, inode); | 3006 | ret = btrfs_orphan_add(trans, inode); |
3005 | BUG_ON(ret); | 3007 | if (ret) |
3008 | goto out; | ||
3006 | } | 3009 | } |
3007 | 3010 | ||
3011 | out: | ||
3008 | nr = trans->blocks_used; | 3012 | nr = trans->blocks_used; |
3009 | __unlink_end_trans(trans, root); | 3013 | __unlink_end_trans(trans, root); |
3010 | btrfs_btree_balance_dirty(root, nr); | 3014 | btrfs_btree_balance_dirty(root, nr); |
@@ -3147,6 +3151,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
3147 | 3151 | ||
3148 | BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); | 3152 | BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); |
3149 | 3153 | ||
3154 | path = btrfs_alloc_path(); | ||
3155 | if (!path) | ||
3156 | return -ENOMEM; | ||
3157 | path->reada = -1; | ||
3158 | |||
3150 | if (root->ref_cows || root == root->fs_info->tree_root) | 3159 | if (root->ref_cows || root == root->fs_info->tree_root) |
3151 | btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); | 3160 | btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); |
3152 | 3161 | ||
@@ -3159,10 +3168,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
3159 | if (min_type == 0 && root == BTRFS_I(inode)->root) | 3168 | if (min_type == 0 && root == BTRFS_I(inode)->root) |
3160 | btrfs_kill_delayed_inode_items(inode); | 3169 | btrfs_kill_delayed_inode_items(inode); |
3161 | 3170 | ||
3162 | path = btrfs_alloc_path(); | ||
3163 | BUG_ON(!path); | ||
3164 | path->reada = -1; | ||
3165 | |||
3166 | key.objectid = ino; | 3171 | key.objectid = ino; |
3167 | key.offset = (u64)-1; | 3172 | key.offset = (u64)-1; |
3168 | key.type = (u8)-1; | 3173 | key.type = (u8)-1; |
@@ -3690,7 +3695,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, | |||
3690 | int ret = 0; | 3695 | int ret = 0; |
3691 | 3696 | ||
3692 | path = btrfs_alloc_path(); | 3697 | path = btrfs_alloc_path(); |
3693 | BUG_ON(!path); | 3698 | if (!path) |
3699 | return -ENOMEM; | ||
3694 | 3700 | ||
3695 | di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, | 3701 | di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, |
3696 | namelen, 0); | 3702 | namelen, 0); |
@@ -3946,6 +3952,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | |||
3946 | struct btrfs_root *root, int *new) | 3952 | struct btrfs_root *root, int *new) |
3947 | { | 3953 | { |
3948 | struct inode *inode; | 3954 | struct inode *inode; |
3955 | int bad_inode = 0; | ||
3949 | 3956 | ||
3950 | inode = btrfs_iget_locked(s, location->objectid, root); | 3957 | inode = btrfs_iget_locked(s, location->objectid, root); |
3951 | if (!inode) | 3958 | if (!inode) |
@@ -3955,10 +3962,19 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, | |||
3955 | BTRFS_I(inode)->root = root; | 3962 | BTRFS_I(inode)->root = root; |
3956 | memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); | 3963 | memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); |
3957 | btrfs_read_locked_inode(inode); | 3964 | btrfs_read_locked_inode(inode); |
3958 | inode_tree_add(inode); | 3965 | if (!is_bad_inode(inode)) { |
3959 | unlock_new_inode(inode); | 3966 | inode_tree_add(inode); |
3960 | if (new) | 3967 | unlock_new_inode(inode); |
3961 | *new = 1; | 3968 | if (new) |
3969 | *new = 1; | ||
3970 | } else { | ||
3971 | bad_inode = 1; | ||
3972 | } | ||
3973 | } | ||
3974 | |||
3975 | if (bad_inode) { | ||
3976 | iput(inode); | ||
3977 | inode = ERR_PTR(-ESTALE); | ||
3962 | } | 3978 | } |
3963 | 3979 | ||
3964 | return inode; | 3980 | return inode; |
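With this change a corrupted or unreadable on-disk inode no longer escapes btrfs_iget() half-initialized: the bad inode is dropped with iput() and the caller sees ERR_PTR(-ESTALE). A hedged usage sketch for callers:

	struct inode *inode;

	inode = btrfs_iget(sb, &location, root, NULL);
	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* e.g. -ESTALE for a bad inode */
	/* here the inode is fully read in and hashed */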
@@ -3993,12 +4009,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) | |||
3993 | struct btrfs_root *sub_root = root; | 4009 | struct btrfs_root *sub_root = root; |
3994 | struct btrfs_key location; | 4010 | struct btrfs_key location; |
3995 | int index; | 4011 | int index; |
3996 | int ret; | 4012 | int ret = 0; |
3997 | 4013 | ||
3998 | if (dentry->d_name.len > BTRFS_NAME_LEN) | 4014 | if (dentry->d_name.len > BTRFS_NAME_LEN) |
3999 | return ERR_PTR(-ENAMETOOLONG); | 4015 | return ERR_PTR(-ENAMETOOLONG); |
4000 | 4016 | ||
4001 | ret = btrfs_inode_by_name(dir, dentry, &location); | 4017 | if (unlikely(d_need_lookup(dentry))) { |
4018 | memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key)); | ||
4019 | kfree(dentry->d_fsdata); | ||
4020 | dentry->d_fsdata = NULL; | ||
4021 | d_clear_need_lookup(dentry); | ||
4022 | } else { | ||
4023 | ret = btrfs_inode_by_name(dir, dentry, &location); | ||
4024 | } | ||
4002 | 4025 | ||
4003 | if (ret < 0) | 4026 | if (ret < 0) |
4004 | return ERR_PTR(ret); | 4027 | return ERR_PTR(ret); |
@@ -4053,6 +4076,12 @@ static int btrfs_dentry_delete(const struct dentry *dentry) | |||
4053 | return 0; | 4076 | return 0; |
4054 | } | 4077 | } |
4055 | 4078 | ||
4079 | static void btrfs_dentry_release(struct dentry *dentry) | ||
4080 | { | ||
4081 | if (dentry->d_fsdata) | ||
4082 | kfree(dentry->d_fsdata); | ||
4083 | } | ||
4084 | |||
4056 | static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, | 4085 | static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, |
4057 | struct nameidata *nd) | 4086 | struct nameidata *nd) |
4058 | { | 4087 | { |
@@ -4075,6 +4104,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
4075 | struct btrfs_path *path; | 4104 | struct btrfs_path *path; |
4076 | struct list_head ins_list; | 4105 | struct list_head ins_list; |
4077 | struct list_head del_list; | 4106 | struct list_head del_list; |
4107 | struct qstr q; | ||
4078 | int ret; | 4108 | int ret; |
4079 | struct extent_buffer *leaf; | 4109 | struct extent_buffer *leaf; |
4080 | int slot; | 4110 | int slot; |
@@ -4164,6 +4194,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
4164 | 4194 | ||
4165 | while (di_cur < di_total) { | 4195 | while (di_cur < di_total) { |
4166 | struct btrfs_key location; | 4196 | struct btrfs_key location; |
4197 | struct dentry *tmp; | ||
4167 | 4198 | ||
4168 | if (verify_dir_item(root, leaf, di)) | 4199 | if (verify_dir_item(root, leaf, di)) |
4169 | break; | 4200 | break; |
@@ -4184,6 +4215,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, | |||
4184 | d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; | 4215 | d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; |
4185 | btrfs_dir_item_key_to_cpu(leaf, di, &location); | 4216 | btrfs_dir_item_key_to_cpu(leaf, di, &location); |
4186 | 4217 | ||
4218 | q.name = name_ptr; | ||
4219 | q.len = name_len; | ||
4220 | q.hash = full_name_hash(q.name, q.len); | ||
4221 | tmp = d_lookup(filp->f_dentry, &q); | ||
4222 | if (!tmp) { | ||
4223 | struct btrfs_key *newkey; | ||
4224 | |||
4225 | newkey = kzalloc(sizeof(struct btrfs_key), | ||
4226 | GFP_NOFS); | ||
4227 | if (!newkey) | ||
4228 | goto no_dentry; | ||
4229 | tmp = d_alloc(filp->f_dentry, &q); | ||
4230 | if (!tmp) { | ||
4231 | kfree(newkey); | ||
4232 | dput(tmp); | ||
4233 | goto no_dentry; | ||
4234 | } | ||
4235 | memcpy(newkey, &location, | ||
4236 | sizeof(struct btrfs_key)); | ||
4237 | tmp->d_fsdata = newkey; | ||
4238 | tmp->d_flags |= DCACHE_NEED_LOOKUP; | ||
4239 | d_rehash(tmp); | ||
4240 | dput(tmp); | ||
4241 | } else { | ||
4242 | dput(tmp); | ||
4243 | } | ||
4244 | no_dentry: | ||
4187 | /* is this a reference to our own snapshot? If so | 4245 | /* is this a reference to our own snapshot? If so |
4188 | * skip it | 4246 | * skip it |
4189 | */ | 4247 | */ |
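Taken together, the readdir, lookup, and d_release hunks give each directory entry's dentry a cached copy of its btrfs key, so a later lookup can skip the dir-item search entirely. The lifecycle, condensed from the hunks above:

	/* readdir: pre-create a dentry, stash the key, mark it */
	tmp->d_fsdata = newkey;
	tmp->d_flags |= DCACHE_NEED_LOOKUP;
	d_rehash(tmp);

	/* lookup: consume the stashed key instead of searching */
	if (unlikely(d_need_lookup(dentry))) {
		memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
		kfree(dentry->d_fsdata);
		dentry->d_fsdata = NULL;
		d_clear_need_lookup(dentry);
	}

	/* d_release: free a key that was never consumed */
	kfree(dentry->d_fsdata);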
@@ -4409,7 +4467,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
4409 | int owner; | 4467 | int owner; |
4410 | 4468 | ||
4411 | path = btrfs_alloc_path(); | 4469 | path = btrfs_alloc_path(); |
4412 | BUG_ON(!path); | 4470 | if (!path) |
4471 | return ERR_PTR(-ENOMEM); | ||
4413 | 4472 | ||
4414 | inode = new_inode(root->fs_info->sb); | 4473 | inode = new_inode(root->fs_info->sb); |
4415 | if (!inode) { | 4474 | if (!inode) { |
@@ -6669,19 +6728,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, | |||
6669 | return 0; | 6728 | return 0; |
6670 | } | 6729 | } |
6671 | 6730 | ||
6672 | /* helper function for file defrag and space balancing. This | ||
6673 | * forces readahead on a given range of bytes in an inode | ||
6674 | */ | ||
6675 | unsigned long btrfs_force_ra(struct address_space *mapping, | ||
6676 | struct file_ra_state *ra, struct file *file, | ||
6677 | pgoff_t offset, pgoff_t last_index) | ||
6678 | { | ||
6679 | pgoff_t req_size = last_index - offset + 1; | ||
6680 | |||
6681 | page_cache_sync_readahead(mapping, ra, file, offset, req_size); | ||
6682 | return offset + req_size; | ||
6683 | } | ||
6684 | |||
6685 | struct inode *btrfs_alloc_inode(struct super_block *sb) | 6731 | struct inode *btrfs_alloc_inode(struct super_block *sb) |
6686 | { | 6732 | { |
6687 | struct btrfs_inode *ei; | 6733 | struct btrfs_inode *ei; |
@@ -7164,7 +7210,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, | |||
7164 | goto out_unlock; | 7210 | goto out_unlock; |
7165 | 7211 | ||
7166 | path = btrfs_alloc_path(); | 7212 | path = btrfs_alloc_path(); |
7167 | BUG_ON(!path); | 7213 | if (!path) { |
7214 | err = -ENOMEM; | ||
7215 | drop_inode = 1; | ||
7216 | goto out_unlock; | ||
7217 | } | ||
7168 | key.objectid = btrfs_ino(inode); | 7218 | key.objectid = btrfs_ino(inode); |
7169 | key.offset = 0; | 7219 | key.offset = 0; |
7170 | btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); | 7220 | btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); |
@@ -7430,4 +7480,5 @@ static const struct inode_operations btrfs_symlink_inode_operations = { | |||
7430 | 7480 | ||
7431 | const struct dentry_operations btrfs_dentry_operations = { | 7481 | const struct dentry_operations btrfs_dentry_operations = { |
7432 | .d_delete = btrfs_dentry_delete, | 7482 | .d_delete = btrfs_dentry_delete, |
7483 | .d_release = btrfs_dentry_release, | ||
7433 | }; | 7484 | }; |
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0b980afc5edd..7cf013349941 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
@@ -1749,11 +1749,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, | |||
1749 | key.objectid = key.offset; | 1749 | key.objectid = key.offset; |
1750 | key.offset = (u64)-1; | 1750 | key.offset = (u64)-1; |
1751 | dirid = key.objectid; | 1751 | dirid = key.objectid; |
1752 | |||
1753 | } | 1752 | } |
1754 | if (ptr < name) | 1753 | if (ptr < name) |
1755 | goto out; | 1754 | goto out; |
1756 | memcpy(name, ptr, total_len); | 1755 | memmove(name, ptr, total_len); |
1757 | name[total_len]='\0'; | 1756 | name[total_len]='\0'; |
1758 | ret = 0; | 1757 | ret = 0; |
1759 | out: | 1758 | out: |
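memmove() is the right call here because ptr points inside name: the path is assembled right-to-left at the tail of the same buffer, so source and destination overlap, and memcpy() on overlapping regions is undefined behavior. A minimal standalone illustration (the data is made up):

	#include <string.h>

	char buf[32] = "..........a/b/c";	/* path built at the tail */
	char *ptr = buf + 10;			/* ptr aliases buf */

	memmove(buf, ptr, strlen(ptr) + 1);	/* overlap-safe shift */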
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c deleted file mode 100644 index 82d569cb6267..000000000000 --- a/fs/btrfs/ref-cache.c +++ /dev/null | |||
@@ -1,68 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | |||
19 | #include <linux/sched.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/sort.h> | ||
22 | #include "ctree.h" | ||
23 | #include "ref-cache.h" | ||
24 | #include "transaction.h" | ||
25 | |||
26 | static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, | ||
27 | struct rb_node *node) | ||
28 | { | ||
29 | struct rb_node **p = &root->rb_node; | ||
30 | struct rb_node *parent = NULL; | ||
31 | struct btrfs_leaf_ref *entry; | ||
32 | |||
33 | while (*p) { | ||
34 | parent = *p; | ||
35 | entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); | ||
36 | |||
37 | if (bytenr < entry->bytenr) | ||
38 | p = &(*p)->rb_left; | ||
39 | else if (bytenr > entry->bytenr) | ||
40 | p = &(*p)->rb_right; | ||
41 | else | ||
42 | return parent; | ||
43 | } | ||
44 | |||
45 | entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); | ||
46 | rb_link_node(node, parent, p); | ||
47 | rb_insert_color(node, root); | ||
48 | return NULL; | ||
49 | } | ||
50 | |||
51 | static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) | ||
52 | { | ||
53 | struct rb_node *n = root->rb_node; | ||
54 | struct btrfs_leaf_ref *entry; | ||
55 | |||
56 | while (n) { | ||
57 | entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); | ||
58 | WARN_ON(!entry->in_tree); | ||
59 | |||
60 | if (bytenr < entry->bytenr) | ||
61 | n = n->rb_left; | ||
62 | else if (bytenr > entry->bytenr) | ||
63 | n = n->rb_right; | ||
64 | else | ||
65 | return n; | ||
66 | } | ||
67 | return NULL; | ||
68 | } | ||
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h deleted file mode 100644 index 24f7001f6387..000000000000 --- a/fs/btrfs/ref-cache.h +++ /dev/null | |||
@@ -1,52 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public | ||
6 | * License v2 as published by the Free Software Foundation. | ||
7 | * | ||
8 | * This program is distributed in the hope that it will be useful, | ||
9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
11 | * General Public License for more details. | ||
12 | * | ||
13 | * You should have received a copy of the GNU General Public | ||
14 | * License along with this program; if not, write to the | ||
15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
16 | * Boston, MA 021110-1307, USA. | ||
17 | */ | ||
18 | #ifndef __REFCACHE__ | ||
19 | #define __REFCACHE__ | ||
20 | |||
21 | struct btrfs_extent_info { | ||
22 | /* bytenr and num_bytes find the extent in the extent allocation tree */ | ||
23 | u64 bytenr; | ||
24 | u64 num_bytes; | ||
25 | |||
26 | /* objectid and offset find the back reference for the file */ | ||
27 | u64 objectid; | ||
28 | u64 offset; | ||
29 | }; | ||
30 | |||
31 | struct btrfs_leaf_ref { | ||
32 | struct rb_node rb_node; | ||
33 | struct btrfs_leaf_ref_tree *tree; | ||
34 | int in_tree; | ||
35 | atomic_t usage; | ||
36 | |||
37 | u64 root_gen; | ||
38 | u64 bytenr; | ||
39 | u64 owner; | ||
40 | u64 generation; | ||
41 | int nritems; | ||
42 | |||
43 | struct list_head list; | ||
44 | struct btrfs_extent_info extents[]; | ||
45 | }; | ||
46 | |||
47 | static inline size_t btrfs_leaf_ref_size(int nr_extents) | ||
48 | { | ||
49 | return sizeof(struct btrfs_leaf_ref) + | ||
50 | sizeof(struct btrfs_extent_info) * nr_extents; | ||
51 | } | ||
52 | #endif | ||
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ebe45443de06..f4099904565a 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c | |||
@@ -71,13 +71,12 @@ out: | |||
71 | return ret; | 71 | return ret; |
72 | } | 72 | } |
73 | 73 | ||
74 | int btrfs_set_root_node(struct btrfs_root_item *item, | 74 | void btrfs_set_root_node(struct btrfs_root_item *item, |
75 | struct extent_buffer *node) | 75 | struct extent_buffer *node) |
76 | { | 76 | { |
77 | btrfs_set_root_bytenr(item, node->start); | 77 | btrfs_set_root_bytenr(item, node->start); |
78 | btrfs_set_root_level(item, btrfs_header_level(node)); | 78 | btrfs_set_root_level(item, btrfs_header_level(node)); |
79 | btrfs_set_root_generation(item, btrfs_header_generation(node)); | 79 | btrfs_set_root_generation(item, btrfs_header_generation(node)); |
80 | return 0; | ||
81 | } | 80 | } |
82 | 81 | ||
83 | /* | 82 | /* |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index eb55863bb4ae..7dc36fab4afc 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -216,17 +216,11 @@ static void wait_current_trans(struct btrfs_root *root) | |||
216 | spin_lock(&root->fs_info->trans_lock); | 216 | spin_lock(&root->fs_info->trans_lock); |
217 | cur_trans = root->fs_info->running_transaction; | 217 | cur_trans = root->fs_info->running_transaction; |
218 | if (cur_trans && cur_trans->blocked) { | 218 | if (cur_trans && cur_trans->blocked) { |
219 | DEFINE_WAIT(wait); | ||
220 | atomic_inc(&cur_trans->use_count); | 219 | atomic_inc(&cur_trans->use_count); |
221 | spin_unlock(&root->fs_info->trans_lock); | 220 | spin_unlock(&root->fs_info->trans_lock); |
222 | while (1) { | 221 | |
223 | prepare_to_wait(&root->fs_info->transaction_wait, &wait, | 222 | wait_event(root->fs_info->transaction_wait, |
224 | TASK_UNINTERRUPTIBLE); | 223 | !cur_trans->blocked); |
225 | if (!cur_trans->blocked) | ||
226 | break; | ||
227 | schedule(); | ||
228 | } | ||
229 | finish_wait(&root->fs_info->transaction_wait, &wait); | ||
230 | put_transaction(cur_trans); | 224 | put_transaction(cur_trans); |
231 | } else { | 225 | } else { |
232 | spin_unlock(&root->fs_info->trans_lock); | 226 | spin_unlock(&root->fs_info->trans_lock); |
@@ -357,19 +351,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root | |||
357 | } | 351 | } |
358 | 352 | ||
359 | /* wait for a transaction commit to be fully complete */ | 353 | /* wait for a transaction commit to be fully complete */ |
360 | static noinline int wait_for_commit(struct btrfs_root *root, | 354 | static noinline void wait_for_commit(struct btrfs_root *root, |
361 | struct btrfs_transaction *commit) | 355 | struct btrfs_transaction *commit) |
362 | { | 356 | { |
363 | DEFINE_WAIT(wait); | 357 | wait_event(commit->commit_wait, commit->commit_done); |
364 | while (!commit->commit_done) { | ||
365 | prepare_to_wait(&commit->commit_wait, &wait, | ||
366 | TASK_UNINTERRUPTIBLE); | ||
367 | if (commit->commit_done) | ||
368 | break; | ||
369 | schedule(); | ||
370 | } | ||
371 | finish_wait(&commit->commit_wait, &wait); | ||
372 | return 0; | ||
373 | } | 358 | } |
374 | 359 | ||
375 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) | 360 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) |
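All three open-coded sleep loops in transaction.c collapse into wait_event(), which packages the same prepare_to_wait()/schedule()/finish_wait() dance. Roughly what wait_event(wq, cond) expands to (simplified; see include/linux/wait.h for the real macros):

	if (!(cond)) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);
			if (cond)
				break;
			schedule();
		}
		finish_wait(&wq, &__wait);
	}

Because wait_event() tests the condition before sleeping, the removed early-return checks such as "if (trans->in_commit) return;" become redundant as well.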
@@ -1085,22 +1070,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) | |||
1085 | static void wait_current_trans_commit_start(struct btrfs_root *root, | 1070 | static void wait_current_trans_commit_start(struct btrfs_root *root, |
1086 | struct btrfs_transaction *trans) | 1071 | struct btrfs_transaction *trans) |
1087 | { | 1072 | { |
1088 | DEFINE_WAIT(wait); | 1073 | wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit); |
1089 | |||
1090 | if (trans->in_commit) | ||
1091 | return; | ||
1092 | |||
1093 | while (1) { | ||
1094 | prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait, | ||
1095 | TASK_UNINTERRUPTIBLE); | ||
1096 | if (trans->in_commit) { | ||
1097 | finish_wait(&root->fs_info->transaction_blocked_wait, | ||
1098 | &wait); | ||
1099 | break; | ||
1100 | } | ||
1101 | schedule(); | ||
1102 | finish_wait(&root->fs_info->transaction_blocked_wait, &wait); | ||
1103 | } | ||
1104 | } | 1074 | } |
1105 | 1075 | ||
1106 | /* | 1076 | /* |
@@ -1110,24 +1080,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root, | |||
1110 | static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, | 1080 | static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, |
1111 | struct btrfs_transaction *trans) | 1081 | struct btrfs_transaction *trans) |
1112 | { | 1082 | { |
1113 | DEFINE_WAIT(wait); | 1083 | wait_event(root->fs_info->transaction_wait, |
1114 | 1084 | trans->commit_done || (trans->in_commit && !trans->blocked)); | |
1115 | if (trans->commit_done || (trans->in_commit && !trans->blocked)) | ||
1116 | return; | ||
1117 | |||
1118 | while (1) { | ||
1119 | prepare_to_wait(&root->fs_info->transaction_wait, &wait, | ||
1120 | TASK_UNINTERRUPTIBLE); | ||
1121 | if (trans->commit_done || | ||
1122 | (trans->in_commit && !trans->blocked)) { | ||
1123 | finish_wait(&root->fs_info->transaction_wait, | ||
1124 | &wait); | ||
1125 | break; | ||
1126 | } | ||
1127 | schedule(); | ||
1128 | finish_wait(&root->fs_info->transaction_wait, | ||
1129 | &wait); | ||
1130 | } | ||
1131 | } | 1085 | } |
1132 | 1086 | ||
1133 | /* | 1087 | /* |
@@ -1234,8 +1188,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1234 | atomic_inc(&cur_trans->use_count); | 1188 | atomic_inc(&cur_trans->use_count); |
1235 | btrfs_end_transaction(trans, root); | 1189 | btrfs_end_transaction(trans, root); |
1236 | 1190 | ||
1237 | ret = wait_for_commit(root, cur_trans); | 1191 | wait_for_commit(root, cur_trans); |
1238 | BUG_ON(ret); | ||
1239 | 1192 | ||
1240 | put_transaction(cur_trans); | 1193 | put_transaction(cur_trans); |
1241 | 1194 | ||
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ac278dd83175..babee65f8eda 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -1617,7 +1617,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, | |||
1617 | return 0; | 1617 | return 0; |
1618 | 1618 | ||
1619 | path = btrfs_alloc_path(); | 1619 | path = btrfs_alloc_path(); |
1620 | BUG_ON(!path); | 1620 | if (!path) |
1621 | return -ENOMEM; | ||
1621 | 1622 | ||
1622 | nritems = btrfs_header_nritems(eb); | 1623 | nritems = btrfs_header_nritems(eb); |
1623 | for (i = 0; i < nritems; i++) { | 1624 | for (i = 0; i < nritems; i++) { |
@@ -1723,7 +1724,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, | |||
1723 | return -ENOMEM; | 1724 | return -ENOMEM; |
1724 | 1725 | ||
1725 | if (*level == 1) { | 1726 | if (*level == 1) { |
1726 | wc->process_func(root, next, wc, ptr_gen); | 1727 | ret = wc->process_func(root, next, wc, ptr_gen); |
1728 | if (ret) | ||
1729 | return ret; | ||
1727 | 1730 | ||
1728 | path->slots[*level]++; | 1731 | path->slots[*level]++; |
1729 | if (wc->free) { | 1732 | if (wc->free) { |
@@ -1788,8 +1791,11 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, | |||
1788 | parent = path->nodes[*level + 1]; | 1791 | parent = path->nodes[*level + 1]; |
1789 | 1792 | ||
1790 | root_owner = btrfs_header_owner(parent); | 1793 | root_owner = btrfs_header_owner(parent); |
1791 | wc->process_func(root, path->nodes[*level], wc, | 1794 | ret = wc->process_func(root, path->nodes[*level], wc, |
1792 | btrfs_header_generation(path->nodes[*level])); | 1795 | btrfs_header_generation(path->nodes[*level])); |
1796 | if (ret) | ||
1797 | return ret; | ||
1798 | |||
1793 | if (wc->free) { | 1799 | if (wc->free) { |
1794 | struct extent_buffer *next; | 1800 | struct extent_buffer *next; |
1795 | 1801 | ||
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b89e372c7544..53875ae73ad4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -1037,7 +1037,8 @@ static noinline int find_next_chunk(struct btrfs_root *root, | |||
1037 | struct btrfs_key found_key; | 1037 | struct btrfs_key found_key; |
1038 | 1038 | ||
1039 | path = btrfs_alloc_path(); | 1039 | path = btrfs_alloc_path(); |
1040 | BUG_ON(!path); | 1040 | if (!path) |
1041 | return -ENOMEM; | ||
1041 | 1042 | ||
1042 | key.objectid = objectid; | 1043 | key.objectid = objectid; |
1043 | key.offset = (u64)-1; | 1044 | key.offset = (u64)-1; |
@@ -2061,8 +2062,10 @@ int btrfs_balance(struct btrfs_root *dev_root) | |||
2061 | 2062 | ||
2062 | /* step two, relocate all the chunks */ | 2063 | /* step two, relocate all the chunks */ |
2063 | path = btrfs_alloc_path(); | 2064 | path = btrfs_alloc_path(); |
2064 | BUG_ON(!path); | 2065 | if (!path) { |
2065 | 2066 | ret = -ENOMEM; | |
2067 | goto error; | ||
2068 | } | ||
2066 | key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; | 2069 | key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; |
2067 | key.offset = (u64)-1; | 2070 | key.offset = (u64)-1; |
2068 | key.type = BTRFS_CHUNK_ITEM_KEY; | 2071 | key.type = BTRFS_CHUNK_ITEM_KEY; |
@@ -2661,7 +2664,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | |||
2661 | 2664 | ||
2662 | ret = find_next_chunk(fs_info->chunk_root, | 2665 | ret = find_next_chunk(fs_info->chunk_root, |
2663 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); | 2666 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); |
2664 | BUG_ON(ret); | 2667 | if (ret) |
2668 | return ret; | ||
2665 | 2669 | ||
2666 | alloc_profile = BTRFS_BLOCK_GROUP_METADATA | | 2670 | alloc_profile = BTRFS_BLOCK_GROUP_METADATA | |
2667 | (fs_info->metadata_alloc_profile & | 2671 | (fs_info->metadata_alloc_profile & |
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 8d8f28c94c0f..6873bb634a97 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c | |||
@@ -141,10 +141,11 @@ char *cifs_compose_mount_options(const char *sb_mountdata, | |||
141 | 141 | ||
142 | rc = dns_resolve_server_name_to_ip(*devname, &srvIP); | 142 | rc = dns_resolve_server_name_to_ip(*devname, &srvIP); |
143 | if (rc < 0) { | 143 | if (rc < 0) { |
144 | cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", | 144 | cFYI(1, "%s: Failed to resolve server part of %s to IP: %d", |
145 | __func__, *devname, rc); | 145 | __func__, *devname, rc); |
146 | goto compose_mount_options_err; | 146 | goto compose_mount_options_err; |
147 | } | 147 | } |
148 | |||
148 | /* md_len = strlen(...) + 12 for 'sep+prefixpath=' | 149 | /* md_len = strlen(...) + 12 for 'sep+prefixpath=' |
149 | * assuming that we have 'unc=' and 'ip=' in | 150 | * assuming that we have 'unc=' and 'ip=' in |
150 | * the original sb_mountdata | 151 | * the original sb_mountdata |
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 259991bd2112..e76bfeb68267 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c | |||
@@ -87,9 +87,15 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, | |||
87 | if ((cifs_pdu == NULL) || (server == NULL)) | 87 | if ((cifs_pdu == NULL) || (server == NULL)) |
88 | return -EINVAL; | 88 | return -EINVAL; |
89 | 89 | ||
90 | if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) | 90 | if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || |
91 | server->tcpStatus == CifsNeedNegotiate) | ||
91 | return rc; | 92 | return rc; |
92 | 93 | ||
94 | if (!server->session_estab) { | ||
95 | strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); | ||
96 | return rc; | ||
97 | } | ||
98 | |||
93 | cifs_pdu->Signature.Sequence.SequenceNumber = | 99 | cifs_pdu->Signature.Sequence.SequenceNumber = |
94 | cpu_to_le32(server->sequence_number); | 100 | cpu_to_le32(server->sequence_number); |
95 | cifs_pdu->Signature.Sequence.Reserved = 0; | 101 | cifs_pdu->Signature.Sequence.Reserved = 0; |
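Both signing paths gain the same two guards: nothing is signed while the connection is still negotiating, and until the session is established (so no signing key exists yet) the well-known placeholder "BSRSPYL" is sent instead of a computed MAC. Condensed from the hunks here:

	if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
	    server->tcpStatus == CifsNeedNegotiate)
		return rc;		/* signing not in effect yet */

	if (!server->session_estab) {
		/* dummy signature used before the session key exists */
		strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
		return rc;
	}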
@@ -178,9 +184,15 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, | |||
178 | if ((cifs_pdu == NULL) || (server == NULL)) | 184 | if ((cifs_pdu == NULL) || (server == NULL)) |
179 | return -EINVAL; | 185 | return -EINVAL; |
180 | 186 | ||
181 | if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) | 187 | if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) || |
188 | server->tcpStatus == CifsNeedNegotiate) | ||
182 | return rc; | 189 | return rc; |
183 | 190 | ||
191 | if (!server->session_estab) { | ||
192 | strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8); | ||
193 | return rc; | ||
194 | } | ||
195 | |||
184 | cifs_pdu->Signature.Sequence.SequenceNumber = | 196 | cifs_pdu->Signature.Sequence.SequenceNumber = |
185 | cpu_to_le32(server->sequence_number); | 197 | cpu_to_le32(server->sequence_number); |
186 | cifs_pdu->Signature.Sequence.Reserved = 0; | 198 | cifs_pdu->Signature.Sequence.Reserved = 0; |
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 865517470967..f93eb948d071 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
@@ -86,24 +86,6 @@ extern mempool_t *cifs_sm_req_poolp; | |||
86 | extern mempool_t *cifs_req_poolp; | 86 | extern mempool_t *cifs_req_poolp; |
87 | extern mempool_t *cifs_mid_poolp; | 87 | extern mempool_t *cifs_mid_poolp; |
88 | 88 | ||
89 | void | ||
90 | cifs_sb_active(struct super_block *sb) | ||
91 | { | ||
92 | struct cifs_sb_info *server = CIFS_SB(sb); | ||
93 | |||
94 | if (atomic_inc_return(&server->active) == 1) | ||
95 | atomic_inc(&sb->s_active); | ||
96 | } | ||
97 | |||
98 | void | ||
99 | cifs_sb_deactive(struct super_block *sb) | ||
100 | { | ||
101 | struct cifs_sb_info *server = CIFS_SB(sb); | ||
102 | |||
103 | if (atomic_dec_and_test(&server->active)) | ||
104 | deactivate_super(sb); | ||
105 | } | ||
106 | |||
107 | static int | 89 | static int |
108 | cifs_read_super(struct super_block *sb) | 90 | cifs_read_super(struct super_block *sb) |
109 | { | 91 | { |
@@ -581,6 +563,10 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) | |||
581 | mutex_unlock(&dir->i_mutex); | 563 | mutex_unlock(&dir->i_mutex); |
582 | dput(dentry); | 564 | dput(dentry); |
583 | dentry = child; | 565 | dentry = child; |
566 | if (!dentry->d_inode) { | ||
567 | dput(dentry); | ||
568 | dentry = ERR_PTR(-ENOENT); | ||
569 | } | ||
584 | } while (!IS_ERR(dentry)); | 570 | } while (!IS_ERR(dentry)); |
585 | _FreeXid(xid); | 571 | _FreeXid(xid); |
586 | kfree(full_path); | 572 | kfree(full_path); |
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index fbd050c8d52a..cb71dc1f94d1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h | |||
@@ -41,10 +41,6 @@ extern struct file_system_type cifs_fs_type; | |||
41 | extern const struct address_space_operations cifs_addr_ops; | 41 | extern const struct address_space_operations cifs_addr_ops; |
42 | extern const struct address_space_operations cifs_addr_ops_smallbuf; | 42 | extern const struct address_space_operations cifs_addr_ops_smallbuf; |
43 | 43 | ||
44 | /* Functions related to super block operations */ | ||
45 | extern void cifs_sb_active(struct super_block *sb); | ||
46 | extern void cifs_sb_deactive(struct super_block *sb); | ||
47 | |||
48 | /* Functions related to inodes */ | 44 | /* Functions related to inodes */ |
49 | extern const struct inode_operations cifs_dir_inode_ops; | 45 | extern const struct inode_operations cifs_dir_inode_ops; |
50 | extern struct inode *cifs_root_iget(struct super_block *); | 46 | extern struct inode *cifs_root_iget(struct super_block *); |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1fcf4e5b3112..38ce6d44b145 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
@@ -942,8 +942,6 @@ GLOBAL_EXTERN spinlock_t siduidlock; | |||
942 | GLOBAL_EXTERN spinlock_t sidgidlock; | 942 | GLOBAL_EXTERN spinlock_t sidgidlock; |
943 | 943 | ||
944 | void cifs_oplock_break(struct work_struct *work); | 944 | void cifs_oplock_break(struct work_struct *work); |
945 | void cifs_oplock_break_get(struct cifsFileInfo *cfile); | ||
946 | void cifs_oplock_break_put(struct cifsFileInfo *cfile); | ||
947 | 945 | ||
948 | extern const struct slow_work_ops cifs_oplock_break_ops; | 946 | extern const struct slow_work_ops cifs_oplock_break_ops; |
949 | 947 | ||
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1a9fe7f816d1..aac37d99a487 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c | |||
@@ -107,7 +107,7 @@ static void mark_open_files_invalid(struct cifs_tcon *pTcon) | |||
107 | static int | 107 | static int |
108 | cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) | 108 | cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) |
109 | { | 109 | { |
110 | int rc = 0; | 110 | int rc; |
111 | struct cifs_ses *ses; | 111 | struct cifs_ses *ses; |
112 | struct TCP_Server_Info *server; | 112 | struct TCP_Server_Info *server; |
113 | struct nls_table *nls_codepage; | 113 | struct nls_table *nls_codepage; |
@@ -5720,6 +5720,7 @@ CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon, | |||
5720 | char *temp_ptr; | 5720 | char *temp_ptr; |
5721 | char *end_of_smb; | 5721 | char *end_of_smb; |
5722 | __u16 params, byte_count, data_offset; | 5722 | __u16 params, byte_count, data_offset; |
5723 | unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0; | ||
5723 | 5724 | ||
5724 | cFYI(1, "In Query All EAs path %s", searchName); | 5725 | cFYI(1, "In Query All EAs path %s", searchName); |
5725 | QAllEAsRetry: | 5726 | QAllEAsRetry: |
@@ -5837,7 +5838,8 @@ QAllEAsRetry: | |||
5837 | } | 5838 | } |
5838 | 5839 | ||
5839 | if (ea_name) { | 5840 | if (ea_name) { |
5840 | if (strncmp(ea_name, temp_ptr, name_len) == 0) { | 5841 | if (ea_name_len == name_len && |
5842 | strncmp(ea_name, temp_ptr, name_len) == 0) { | ||
5841 | temp_ptr += name_len + 1; | 5843 | temp_ptr += name_len + 1; |
5842 | rc = value_len; | 5844 | rc = value_len; |
5843 | if (buf_size == 0) | 5845 | if (buf_size == 0) |
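The extra length check closes a prefix-match bug: strncmp() bounded by name_len (the stored EA name's length) treats a longer requested name as equal to a stored name that is its prefix. An illustration with made-up names:

	/* old check with stored name "user" (name_len == 4):
	 * strncmp("user.comment", "user", 4) == 0 -- false match */

	/* new check: ea_name_len (12) != name_len (4), no match */
	if (ea_name_len == name_len &&
	    strncmp(ea_name, temp_ptr, name_len) == 0)
		/* names match exactly */;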
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index e66297bad412..80c2e3add3a2 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
@@ -319,25 +319,328 @@ requeue_echo: | |||
319 | queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); | 319 | queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); |
320 | } | 320 | } |
321 | 321 | ||
322 | static bool | ||
323 | allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size, | ||
324 | bool is_large_buf) | ||
325 | { | ||
326 | char *bbuf = *bigbuf, *sbuf = *smallbuf; | ||
327 | |||
328 | if (bbuf == NULL) { | ||
329 | bbuf = (char *)cifs_buf_get(); | ||
330 | if (!bbuf) { | ||
331 | cERROR(1, "No memory for large SMB response"); | ||
332 | msleep(3000); | ||
333 | /* retry will check if exiting */ | ||
334 | return false; | ||
335 | } | ||
336 | } else if (is_large_buf) { | ||
337 | /* we are reusing a dirty large buf, clear its start */ | ||
338 | memset(bbuf, 0, size); | ||
339 | } | ||
340 | |||
341 | if (sbuf == NULL) { | ||
342 | sbuf = (char *)cifs_small_buf_get(); | ||
343 | if (!sbuf) { | ||
344 | cERROR(1, "No memory for SMB response"); | ||
345 | msleep(1000); | ||
346 | /* retry will check if exiting */ | ||
347 | return false; | ||
348 | } | ||
349 | /* beginning of smb buffer is cleared in our buf_get */ | ||
350 | } else { | ||
351 | /* if existing small buf clear beginning */ | ||
352 | memset(sbuf, 0, size); | ||
353 | } | ||
354 | |||
355 | *bigbuf = bbuf; | ||
356 | *smallbuf = sbuf; | ||
357 | |||
358 | return true; | ||
359 | } | ||
360 | |||
361 | static int | ||
362 | read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg, | ||
363 | struct kvec *iov, unsigned int to_read, | ||
364 | unsigned int *ptotal_read, bool is_header_read) | ||
365 | { | ||
366 | int length, rc = 0; | ||
367 | unsigned int total_read; | ||
368 | char *buf = iov->iov_base; | ||
369 | |||
370 | for (total_read = 0; total_read < to_read; total_read += length) { | ||
371 | length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1, | ||
372 | to_read - total_read, 0); | ||
373 | if (server->tcpStatus == CifsExiting) { | ||
374 | /* then will exit */ | ||
375 | rc = 2; | ||
376 | break; | ||
377 | } else if (server->tcpStatus == CifsNeedReconnect) { | ||
378 | cifs_reconnect(server); | ||
379 | /* Reconnect wakes up rspns q */ | ||
380 | /* Now we will reread sock */ | ||
381 | rc = 1; | ||
382 | break; | ||
383 | } else if (length == -ERESTARTSYS || | ||
384 | length == -EAGAIN || | ||
385 | length == -EINTR) { | ||
386 | /* | ||
387 | * Minimum sleep to prevent looping, allowing socket | ||
388 | * to clear and app threads to set tcpStatus | ||
389 | * CifsNeedReconnect if server hung. | ||
390 | */ | ||
391 | usleep_range(1000, 2000); | ||
392 | length = 0; | ||
393 | if (!is_header_read) | ||
394 | continue; | ||
395 | /* Special handling for header read */ | ||
396 | if (total_read) { | ||
397 | iov->iov_base = (to_read - total_read) + | ||
398 | buf; | ||
399 | iov->iov_len = to_read - total_read; | ||
400 | smb_msg->msg_control = NULL; | ||
401 | smb_msg->msg_controllen = 0; | ||
402 | rc = 3; | ||
403 | } else | ||
404 | rc = 1; | ||
405 | break; | ||
406 | } else if (length <= 0) { | ||
407 | cERROR(1, "Received no data, expecting %d", | ||
408 | to_read - total_read); | ||
409 | cifs_reconnect(server); | ||
410 | rc = 1; | ||
411 | break; | ||
412 | } | ||
413 | } | ||
414 | |||
415 | *ptotal_read = total_read; | ||
416 | return rc; | ||
417 | } | ||
418 | |||
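read_from_socket() folds the receive loop's many exit conditions into a small return-code convention, which the rewritten main loop below dispatches on. Reconstructed from the call site (0 means the full read completed; pdu_length is 4 at this point, the RFC 1002 header size):

	rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
			      &total_read, true /* header read */);
	if (rc == 3)		/* partial header: resume the read */
		goto incomplete_rcv;
	else if (rc == 2)	/* tcpStatus == CifsExiting: stop */
		break;
	else if (rc == 1)	/* reconnected or error: start over */
		continue;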
419 | static bool | ||
420 | check_rfc1002_header(struct TCP_Server_Info *server, char *buf) | ||
421 | { | ||
422 | char temp = *buf; | ||
423 | unsigned int pdu_length = be32_to_cpu( | ||
424 | ((struct smb_hdr *)buf)->smb_buf_length); | ||
425 | |||
426 | /* | ||
427 | * The first byte big endian of the length field, | ||
428 | * is actually not part of the length but the type | ||
429 | * with the most common, zero, as regular data. | ||
430 | */ | ||
431 | if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { | ||
432 | return false; | ||
433 | } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { | ||
434 | cFYI(1, "Good RFC 1002 session rsp"); | ||
435 | return false; | ||
436 | } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { | ||
437 | /* | ||
438 | * We get this from Windows 98 instead of an error on | ||
439 | * SMB negprot response. | ||
440 | */ | ||
441 | cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", | ||
442 | pdu_length); | ||
443 | /* give server a second to clean up */ | ||
444 | msleep(1000); | ||
445 | /* | ||
446 | * Always try 445 first on reconnect since we get NACK | ||
447 | * on some if we ever connected to port 139 (the NACK | ||
448 | * is since we do not begin with RFC1001 session | ||
449 | * initialize frame). | ||
450 | */ | ||
451 | cifs_set_port((struct sockaddr *) | ||
452 | &server->dstaddr, CIFS_PORT); | ||
453 | cifs_reconnect(server); | ||
454 | wake_up(&server->response_q); | ||
455 | return false; | ||
456 | } else if (temp != (char) 0) { | ||
457 | cERROR(1, "Unknown RFC 1002 frame"); | ||
458 | cifs_dump_mem(" Received Data: ", buf, 4); | ||
459 | cifs_reconnect(server); | ||
460 | return false; | ||
461 | } | ||
462 | |||
463 | /* else we have an SMB response */ | ||
464 | if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || | ||
465 | (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { | ||
466 | cERROR(1, "Invalid size SMB length %d pdu_length %d", | ||
467 | 4, pdu_length+4); | ||
468 | cifs_reconnect(server); | ||
469 | wake_up(&server->response_q); | ||
470 | return false; | ||
471 | } | ||
472 | |||
473 | return true; | ||
474 | } | ||
475 | |||
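check_rfc1002_header() dispatches on the first byte of the 4-byte RFC 1002 session header, which is a packet type rather than part of the length. The values below follow RFC 1002; the constant names match those used in the function and are assumed to be defined in the CIFS headers:

	enum {					/* values per RFC 1002 */
		RFC1002_SESSION_MESSAGE		  = 0x00, /* carries an SMB */
		RFC1002_SESSION_REQUEST		  = 0x81,
		RFC1002_POSITIVE_SESSION_RESPONSE = 0x82,
		RFC1002_NEGATIVE_SESSION_RESPONSE = 0x83,
		RFC1002_SESSION_KEEP_ALIVE	  = 0x85,
	};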
476 | static struct mid_q_entry * | ||
477 | find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf, | ||
478 | int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf) | ||
479 | { | ||
480 | struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL; | ||
481 | |||
482 | spin_lock(&GlobalMid_Lock); | ||
483 | list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) { | ||
484 | if (mid->mid != buf->Mid || | ||
485 | mid->midState != MID_REQUEST_SUBMITTED || | ||
486 | mid->command != buf->Command) | ||
487 | continue; | ||
488 | |||
489 | if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) { | ||
490 | /* We have a multipart transact2 resp */ | ||
491 | *is_multi_rsp = true; | ||
492 | if (mid->resp_buf) { | ||
493 | /* merge response - fix up 1st*/ | ||
494 | *length = coalesce_t2(buf, mid->resp_buf); | ||
495 | if (*length > 0) { | ||
496 | *length = 0; | ||
497 | mid->multiRsp = true; | ||
498 | break; | ||
499 | } | ||
500 | /* All parts received or packet is malformed. */ | ||
501 | mid->multiEnd = true; | ||
502 | goto multi_t2_fnd; | ||
503 | } | ||
504 | if (!is_large_buf) { | ||
505 | /*FIXME: switch to already allocated largebuf?*/ | ||
506 | cERROR(1, "1st trans2 resp needs bigbuf"); | ||
507 | } else { | ||
508 | /* Have first buffer */ | ||
509 | mid->resp_buf = buf; | ||
510 | mid->largeBuf = true; | ||
511 | *bigbuf = NULL; | ||
512 | } | ||
513 | break; | ||
514 | } | ||
515 | mid->resp_buf = buf; | ||
516 | mid->largeBuf = is_large_buf; | ||
517 | multi_t2_fnd: | ||
518 | if (*length == 0) | ||
519 | mid->midState = MID_RESPONSE_RECEIVED; | ||
520 | else | ||
521 | mid->midState = MID_RESPONSE_MALFORMED; | ||
522 | #ifdef CONFIG_CIFS_STATS2 | ||
523 | mid->when_received = jiffies; | ||
524 | #endif | ||
525 | list_del_init(&mid->qhead); | ||
526 | ret = mid; | ||
527 | break; | ||
528 | } | ||
529 | spin_unlock(&GlobalMid_Lock); | ||
530 | |||
531 | return ret; | ||
532 | } | ||
533 | |||
534 | static void clean_demultiplex_info(struct TCP_Server_Info *server) | ||
535 | { | ||
536 | int length; | ||
537 | |||
538 | /* take it off the list, if it's not already */ | ||
539 | spin_lock(&cifs_tcp_ses_lock); | ||
540 | list_del_init(&server->tcp_ses_list); | ||
541 | spin_unlock(&cifs_tcp_ses_lock); | ||
542 | |||
543 | spin_lock(&GlobalMid_Lock); | ||
544 | server->tcpStatus = CifsExiting; | ||
545 | spin_unlock(&GlobalMid_Lock); | ||
546 | wake_up_all(&server->response_q); | ||
547 | |||
548 | /* | ||
549 | * Check if we have blocked requests that need to free. Note that | ||
550 | * cifs_max_pending is normally 50, but can be set at module install | ||
551 | * time to as little as two. | ||
552 | */ | ||
553 | spin_lock(&GlobalMid_Lock); | ||
554 | if (atomic_read(&server->inFlight) >= cifs_max_pending) | ||
555 | atomic_set(&server->inFlight, cifs_max_pending - 1); | ||
556 | /* | ||
557 | * We do not want to set the max_pending too low or we could end up | ||
558 | * with the counter going negative. | ||
559 | */ | ||
560 | spin_unlock(&GlobalMid_Lock); | ||
561 | /* | ||
562 | * Although there should not be any requests blocked on this queue it | ||
563 | * can not hurt to be paranoid and try to wake up requests that may | ||
564 | * have been blocked when more than 50 at a time were on the wire to the ||
565 | * same server - they now will see the session is in exit state and get | ||
566 | * out of SendReceive. | ||
567 | */ | ||
568 | wake_up_all(&server->request_q); | ||
569 | /* give those requests time to exit */ | ||
570 | msleep(125); | ||
571 | |||
572 | if (server->ssocket) { | ||
573 | sock_release(server->ssocket); | ||
574 | server->ssocket = NULL; | ||
575 | } | ||
576 | |||
577 | if (!list_empty(&server->pending_mid_q)) { | ||
578 | struct list_head dispose_list; | ||
579 | struct mid_q_entry *mid_entry; | ||
580 | struct list_head *tmp, *tmp2; | ||
581 | |||
582 | INIT_LIST_HEAD(&dispose_list); | ||
583 | spin_lock(&GlobalMid_Lock); | ||
584 | list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { | ||
585 | mid_entry = list_entry(tmp, struct mid_q_entry, qhead); | ||
586 | cFYI(1, "Clearing mid 0x%x", mid_entry->mid); | ||
587 | mid_entry->midState = MID_SHUTDOWN; | ||
588 | list_move(&mid_entry->qhead, &dispose_list); | ||
589 | } | ||
590 | spin_unlock(&GlobalMid_Lock); | ||
591 | |||
592 | /* now walk dispose list and issue callbacks */ | ||
593 | list_for_each_safe(tmp, tmp2, &dispose_list) { | ||
594 | mid_entry = list_entry(tmp, struct mid_q_entry, qhead); | ||
595 | cFYI(1, "Callback mid 0x%x", mid_entry->mid); | ||
596 | list_del_init(&mid_entry->qhead); | ||
597 | mid_entry->callback(mid_entry); | ||
598 | } | ||
599 | /* 1/8th of sec is more than enough time for them to exit */ | ||
600 | msleep(125); | ||
601 | } | ||
602 | |||
603 | if (!list_empty(&server->pending_mid_q)) { | ||
604 | /* | ||
605 | * mpx threads have not exited yet give them at least the smb | ||
606 | * send timeout time for long ops. | ||
607 | * | ||
608 | * Due to delays on oplock break requests, we need to wait at | ||
609 | * least 45 seconds before giving up on a request getting a | ||
610 | * response and going ahead and killing cifsd. | ||
611 | */ | ||
612 | cFYI(1, "Wait for exit from demultiplex thread"); | ||
613 | msleep(46000); | ||
614 | /* | ||
615 | * If threads still have not exited they are probably never | ||
616 | * coming home; not much else we can do but free the memory. ||
617 | */ | ||
618 | } | ||
619 | |||
620 | kfree(server->hostname); | ||
621 | kfree(server); | ||
622 | |||
623 | length = atomic_dec_return(&tcpSesAllocCount); | ||
624 | if (length > 0) | ||
625 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, | ||
626 | GFP_KERNEL); | ||
627 | } | ||
628 | |||
322 | static int | 629 | static int |
323 | cifs_demultiplex_thread(void *p) | 630 | cifs_demultiplex_thread(void *p) |
324 | { | 631 | { |
325 | int length; | 632 | int length; |
326 | struct TCP_Server_Info *server = p; | 633 | struct TCP_Server_Info *server = p; |
327 | unsigned int pdu_length, total_read; | 634 | unsigned int pdu_length, total_read; |
635 | char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL; | ||
328 | struct smb_hdr *smb_buffer = NULL; | 636 | struct smb_hdr *smb_buffer = NULL; |
329 | struct smb_hdr *bigbuf = NULL; | ||
330 | struct smb_hdr *smallbuf = NULL; | ||
331 | struct msghdr smb_msg; | 637 | struct msghdr smb_msg; |
332 | struct kvec iov; | 638 | struct kvec iov; |
333 | struct socket *csocket = server->ssocket; | ||
334 | struct list_head *tmp, *tmp2; | ||
335 | struct task_struct *task_to_wake = NULL; | 639 | struct task_struct *task_to_wake = NULL; |
336 | struct mid_q_entry *mid_entry; | 640 | struct mid_q_entry *mid_entry; |
337 | char temp; | ||
338 | bool isLargeBuf = false; | 641 | bool isLargeBuf = false; |
339 | bool isMultiRsp; | 642 | bool isMultiRsp = false; |
340 | int reconnect; | 643 | int rc; |
341 | 644 | ||
342 | current->flags |= PF_MEMALLOC; | 645 | current->flags |= PF_MEMALLOC; |
343 | cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); | 646 | cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); |
@@ -351,35 +654,16 @@ cifs_demultiplex_thread(void *p) | |||
351 | while (server->tcpStatus != CifsExiting) { | 654 | while (server->tcpStatus != CifsExiting) { |
352 | if (try_to_freeze()) | 655 | if (try_to_freeze()) |
353 | continue; | 656 | continue; |
354 | if (bigbuf == NULL) { | ||
355 | bigbuf = cifs_buf_get(); | ||
356 | if (!bigbuf) { | ||
357 | cERROR(1, "No memory for large SMB response"); | ||
358 | msleep(3000); | ||
359 | /* retry will check if exiting */ | ||
360 | continue; | ||
361 | } | ||
362 | } else if (isLargeBuf) { | ||
363 | /* we are reusing a dirty large buf, clear its start */ | ||
364 | memset(bigbuf, 0, sizeof(struct smb_hdr)); | ||
365 | } | ||
366 | 657 | ||
367 | if (smallbuf == NULL) { | 658 | if (!allocate_buffers(&bigbuf, &smallbuf, |
368 | smallbuf = cifs_small_buf_get(); | 659 | sizeof(struct smb_hdr), isLargeBuf)) |
369 | if (!smallbuf) { | 660 | continue; |
370 | cERROR(1, "No memory for SMB response"); | ||
371 | msleep(1000); | ||
372 | /* retry will check if exiting */ | ||
373 | continue; | ||
374 | } | ||
375 | /* beginning of smb buffer is cleared in our buf_get */ | ||
376 | } else /* if existing small buf clear beginning */ | ||
377 | memset(smallbuf, 0, sizeof(struct smb_hdr)); | ||
378 | 661 | ||
379 | isLargeBuf = false; | 662 | isLargeBuf = false; |
380 | isMultiRsp = false; | 663 | isMultiRsp = false; |
381 | smb_buffer = smallbuf; | 664 | smb_buffer = (struct smb_hdr *)smallbuf; |
382 | iov.iov_base = smb_buffer; | 665 | buf = smallbuf; |
666 | iov.iov_base = buf; | ||
383 | iov.iov_len = 4; | 667 | iov.iov_len = 4; |
384 | smb_msg.msg_control = NULL; | 668 | smb_msg.msg_control = NULL; |
385 | smb_msg.msg_controllen = 0; | 669 | smb_msg.msg_controllen = 0; |
@@ -393,158 +677,50 @@ incomplete_rcv: | |||
393 | "Reconnecting...", server->hostname, | 677 | "Reconnecting...", server->hostname, |
394 | (echo_retries * SMB_ECHO_INTERVAL / HZ)); | 678 | (echo_retries * SMB_ECHO_INTERVAL / HZ)); |
395 | cifs_reconnect(server); | 679 | cifs_reconnect(server); |
396 | csocket = server->ssocket; | ||
397 | wake_up(&server->response_q); | 680 | wake_up(&server->response_q); |
398 | continue; | 681 | continue; |
399 | } | 682 | } |
400 | 683 | ||
401 | length = | 684 | rc = read_from_socket(server, &smb_msg, &iov, pdu_length, |
402 | kernel_recvmsg(csocket, &smb_msg, | 685 | &total_read, true /* header read */); |
403 | &iov, 1, pdu_length, 0 /* BB other flags? */); | 686 | if (rc == 3) |
404 | 687 | goto incomplete_rcv; | |
405 | if (server->tcpStatus == CifsExiting) { | 688 | else if (rc == 2) |
406 | break; | 689 | break; |
407 | } else if (server->tcpStatus == CifsNeedReconnect) { | 690 | else if (rc == 1) |
408 | cFYI(1, "Reconnect after server stopped responding"); | ||
409 | cifs_reconnect(server); | ||
410 | cFYI(1, "call to reconnect done"); | ||
411 | csocket = server->ssocket; | ||
412 | continue; | ||
413 | } else if (length == -ERESTARTSYS || | ||
414 | length == -EAGAIN || | ||
415 | length == -EINTR) { | ||
416 | msleep(1); /* minimum sleep to prevent looping | ||
417 | allowing socket to clear and app threads to set | ||
418 | tcpStatus CifsNeedReconnect if server hung */ | ||
419 | if (pdu_length < 4) { | ||
420 | iov.iov_base = (4 - pdu_length) + | ||
421 | (char *)smb_buffer; | ||
422 | iov.iov_len = pdu_length; | ||
423 | smb_msg.msg_control = NULL; | ||
424 | smb_msg.msg_controllen = 0; | ||
425 | goto incomplete_rcv; | ||
426 | } else | ||
427 | continue; | ||
428 | } else if (length <= 0) { | ||
429 | cFYI(1, "Reconnect after unexpected peek error %d", | ||
430 | length); | ||
431 | cifs_reconnect(server); | ||
432 | csocket = server->ssocket; | ||
433 | wake_up(&server->response_q); | ||
434 | continue; | 691 | continue; |
435 | } else if (length < pdu_length) { | ||
436 | cFYI(1, "requested %d bytes but only got %d bytes", | ||
437 | pdu_length, length); | ||
438 | pdu_length -= length; | ||
439 | msleep(1); | ||
440 | goto incomplete_rcv; | ||
441 | } | ||
442 | |||
443 | /* The right amount was read from socket - 4 bytes */ | ||
444 | /* so we can now interpret the length field */ | ||
445 | 692 | ||
446 | /* the first byte big endian of the length field, | 693 | /* |
447 | is actually not part of the length but the type | 694 | * The right amount was read from socket - 4 bytes, |
448 | with the most common, zero, as regular data */ | 695 | * so we can now interpret the length field. |
449 | temp = *((char *) smb_buffer); | 696 | */ |
450 | 697 | ||
451 | /* Note that FC 1001 length is big endian on the wire, | 698 | /* |
452 | but we convert it here so it is always manipulated | 699 | * Note that RFC 1001 length is big endian on the wire, |
453 | as host byte order */ | 700 | * but we convert it here so it is always manipulated |
701 | * as host byte order. | ||
702 | */ | ||
454 | pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); | 703 | pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); |
455 | 704 | ||
456 | cFYI(1, "rfc1002 length 0x%x", pdu_length+4); | 705 | cFYI(1, "rfc1002 length 0x%x", pdu_length+4); |
457 | 706 | if (!check_rfc1002_header(server, buf)) | |
458 | if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) { | ||
459 | continue; | ||
460 | } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) { | ||
461 | cFYI(1, "Good RFC 1002 session rsp"); | ||
462 | continue; | ||
463 | } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) { | ||
464 | /* we get this from Windows 98 instead of | ||
465 | an error on SMB negprot response */ | ||
466 | cFYI(1, "Negative RFC1002 Session Response Error 0x%x)", | ||
467 | pdu_length); | ||
468 | /* give server a second to clean up */ | ||
469 | msleep(1000); | ||
470 | /* always try 445 first on reconnect since we get NACK | ||
471 | * on some servers if we ever connected to port 139 (the NACK | ||
472 | * is because we do not begin with an RFC1001 session | ||
473 | * initialize frame) | ||
474 | */ | ||
475 | cifs_set_port((struct sockaddr *) | ||
476 | &server->dstaddr, CIFS_PORT); | ||
477 | cifs_reconnect(server); | ||
478 | csocket = server->ssocket; | ||
479 | wake_up(&server->response_q); | ||
480 | continue; | ||
481 | } else if (temp != (char) 0) { | ||
482 | cERROR(1, "Unknown RFC 1002 frame"); | ||
483 | cifs_dump_mem(" Received Data: ", (char *)smb_buffer, | ||
484 | length); | ||
485 | cifs_reconnect(server); | ||
486 | csocket = server->ssocket; | ||
487 | continue; | 707 | continue; |
488 | } | ||
489 | |||
490 | /* else we have an SMB response */ | ||
491 | if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) || | ||
492 | (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) { | ||
493 | cERROR(1, "Invalid size SMB length %d pdu_length %d", | ||
494 | length, pdu_length+4); | ||
495 | cifs_reconnect(server); | ||
496 | csocket = server->ssocket; | ||
497 | wake_up(&server->response_q); | ||
498 | continue; | ||
499 | } | ||
500 | 708 | ||
501 | /* else length ok */ | 709 | /* else length ok */ |
502 | reconnect = 0; | ||
503 | |||
504 | if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { | 710 | if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { |
505 | isLargeBuf = true; | 711 | isLargeBuf = true; |
506 | memcpy(bigbuf, smallbuf, 4); | 712 | memcpy(bigbuf, smallbuf, 4); |
507 | smb_buffer = bigbuf; | 713 | smb_buffer = (struct smb_hdr *)bigbuf; |
714 | buf = bigbuf; | ||
508 | } | 715 | } |
509 | length = 0; | 716 | |
510 | iov.iov_base = 4 + (char *)smb_buffer; | 717 | iov.iov_base = 4 + buf; |
511 | iov.iov_len = pdu_length; | 718 | iov.iov_len = pdu_length; |
512 | for (total_read = 0; total_read < pdu_length; | 719 | rc = read_from_socket(server, &smb_msg, &iov, pdu_length, |
513 | total_read += length) { | 720 | &total_read, false); |
514 | length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, | 721 | if (rc == 2) |
515 | pdu_length - total_read, 0); | ||
516 | if (server->tcpStatus == CifsExiting) { | ||
517 | /* then will exit */ | ||
518 | reconnect = 2; | ||
519 | break; | ||
520 | } else if (server->tcpStatus == CifsNeedReconnect) { | ||
521 | cifs_reconnect(server); | ||
522 | csocket = server->ssocket; | ||
523 | /* Reconnect wakes up rspns q */ | ||
524 | /* Now we will reread sock */ | ||
525 | reconnect = 1; | ||
526 | break; | ||
527 | } else if (length == -ERESTARTSYS || | ||
528 | length == -EAGAIN || | ||
529 | length == -EINTR) { | ||
530 | msleep(1); /* minimum sleep to prevent looping, | ||
531 | allowing socket to clear and app | ||
532 | threads to set tcpStatus | ||
533 | CifsNeedReconnect if server hung*/ | ||
534 | length = 0; | ||
535 | continue; | ||
536 | } else if (length <= 0) { | ||
537 | cERROR(1, "Received no data, expecting %d", | ||
538 | pdu_length - total_read); | ||
539 | cifs_reconnect(server); | ||
540 | csocket = server->ssocket; | ||
541 | reconnect = 1; | ||
542 | break; | ||
543 | } | ||
544 | } | ||
545 | if (reconnect == 2) | ||
546 | break; | 722 | break; |
547 | else if (reconnect == 1) | 723 | else if (rc == 1) |
548 | continue; | 724 | continue; |
549 | 725 | ||
550 | total_read += 4; /* account for rfc1002 hdr */ | 726 | total_read += 4; /* account for rfc1002 hdr */ |
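Header and body reads now go through read_from_socket(), and the demultiplex loop dispatches on its small integer return codes (3: resume at incomplete_rcv, 2: the thread is exiting, 1: restart the loop after a reconnect). The helper itself is not part of this excerpt; the following is a hedged reconstruction of its contract from the deleted error handling above, so treat the exact iov bookkeeping as an assumption.

static int
read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
		 struct kvec *iov, unsigned int to_read,
		 unsigned int *total_read, bool is_header_read)
{
	int length;

	/* caller has zeroed *total_read before the first call */
	while (*total_read < to_read) {
		length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
					to_read - *total_read, 0);
		if (server->tcpStatus == CifsExiting)
			return 2;	/* caller breaks out of the loop */

		if (server->tcpStatus == CifsNeedReconnect) {
			cifs_reconnect(server);
			return 1;	/* caller restarts the loop */
		}

		if (length == -ERESTARTSYS || length == -EAGAIN ||
		    length == -EINTR) {
			/*
			 * Minimum sleep to prevent looping, allowing socket
			 * to clear and app threads to set tcpStatus
			 * CifsNeedReconnect if the server hung.
			 */
			msleep(1);
			if (is_header_read && *total_read < 4)
				return 3;	/* resume at incomplete_rcv */
			continue;
		}

		if (length <= 0) {
			cFYI(1, "Received no data, expecting %d",
			     to_read - *total_read);
			cifs_reconnect(server);
			wake_up(&server->response_q);
			return 1;
		}

		*total_read += length;
		iov->iov_base = (char *)iov->iov_base + length;
		iov->iov_len -= length;
	}
	return 0;
}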
@@ -562,75 +738,13 @@ incomplete_rcv: | |||
562 | */ | 738 | */ |
563 | length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); | 739 | length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); |
564 | if (length != 0) | 740 | if (length != 0) |
565 | cifs_dump_mem("Bad SMB: ", smb_buffer, | 741 | cifs_dump_mem("Bad SMB: ", buf, |
566 | min_t(unsigned int, total_read, 48)); | 742 | min_t(unsigned int, total_read, 48)); |
567 | 743 | ||
568 | mid_entry = NULL; | ||
569 | server->lstrp = jiffies; | 744 | server->lstrp = jiffies; |
570 | 745 | ||
571 | spin_lock(&GlobalMid_Lock); | 746 | mid_entry = find_cifs_mid(server, smb_buffer, &length, |
572 | list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { | 747 | isLargeBuf, &isMultiRsp, &bigbuf); |
573 | mid_entry = list_entry(tmp, struct mid_q_entry, qhead); | ||
574 | |||
575 | if (mid_entry->mid != smb_buffer->Mid || | ||
576 | mid_entry->midState != MID_REQUEST_SUBMITTED || | ||
577 | mid_entry->command != smb_buffer->Command) { | ||
578 | mid_entry = NULL; | ||
579 | continue; | ||
580 | } | ||
581 | |||
582 | if (length == 0 && | ||
583 | check2ndT2(smb_buffer, server->maxBuf) > 0) { | ||
584 | /* We have a multipart transact2 resp */ | ||
585 | isMultiRsp = true; | ||
586 | if (mid_entry->resp_buf) { | ||
587 | /* merge response - fix up 1st*/ | ||
588 | length = coalesce_t2(smb_buffer, | ||
589 | mid_entry->resp_buf); | ||
590 | if (length > 0) { | ||
591 | length = 0; | ||
592 | mid_entry->multiRsp = true; | ||
593 | break; | ||
594 | } else { | ||
595 | /* all parts received or | ||
596 | * packet is malformed | ||
597 | */ | ||
598 | mid_entry->multiEnd = true; | ||
599 | goto multi_t2_fnd; | ||
600 | } | ||
601 | } else { | ||
602 | if (!isLargeBuf) { | ||
603 | /* | ||
604 | * FIXME: switch to already | ||
605 | * allocated largebuf? | ||
606 | */ | ||
607 | cERROR(1, "1st trans2 resp " | ||
608 | "needs bigbuf"); | ||
609 | } else { | ||
610 | /* Have first buffer */ | ||
611 | mid_entry->resp_buf = | ||
612 | smb_buffer; | ||
613 | mid_entry->largeBuf = true; | ||
614 | bigbuf = NULL; | ||
615 | } | ||
616 | } | ||
617 | break; | ||
618 | } | ||
619 | mid_entry->resp_buf = smb_buffer; | ||
620 | mid_entry->largeBuf = isLargeBuf; | ||
621 | multi_t2_fnd: | ||
622 | if (length == 0) | ||
623 | mid_entry->midState = MID_RESPONSE_RECEIVED; | ||
624 | else | ||
625 | mid_entry->midState = MID_RESPONSE_MALFORMED; | ||
626 | #ifdef CONFIG_CIFS_STATS2 | ||
627 | mid_entry->when_received = jiffies; | ||
628 | #endif | ||
629 | list_del_init(&mid_entry->qhead); | ||
630 | break; | ||
631 | } | ||
632 | spin_unlock(&GlobalMid_Lock); | ||
633 | |||
634 | if (mid_entry != NULL) { | 748 | if (mid_entry != NULL) { |
635 | mid_entry->callback(mid_entry); | 749 | mid_entry->callback(mid_entry); |
636 | /* Was previous buf put in mpx struct for multi-rsp? */ | 750 | /* Was previous buf put in mpx struct for multi-rsp? */ |
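find_cifs_mid() replaces the open-coded walk of server->pending_mid_q deleted above. A condensed sketch of its matching core follows; the multi-part transact2 path (check2ndT2(), coalesce_t2(), the *is_multi_rsp and *bigbuf handoff) is deliberately elided, so this is not the complete upstream helper.

static struct mid_q_entry *
find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf,
	      int *length, bool is_large_buf, bool *is_multi_rsp,
	      char **bigbuf)
{
	struct mid_q_entry *mid = NULL, *tmp;

	spin_lock(&GlobalMid_Lock);
	list_for_each_entry(tmp, &server->pending_mid_q, qhead) {
		if (tmp->mid != buf->Mid ||
		    tmp->midState != MID_REQUEST_SUBMITTED ||
		    tmp->command != buf->Command)
			continue;

		/* multi-part T2 coalescing would happen here (elided) */
		tmp->resp_buf = buf;
		tmp->largeBuf = is_large_buf;
		tmp->midState = (*length == 0) ? MID_RESPONSE_RECEIVED
					       : MID_RESPONSE_MALFORMED;
#ifdef CONFIG_CIFS_STATS2
		tmp->when_received = jiffies;
#endif
		list_del_init(&tmp->qhead);
		mid = tmp;
		break;
	}
	spin_unlock(&GlobalMid_Lock);
	return mid;
}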
@@ -648,7 +762,7 @@ multi_t2_fnd: | |||
648 | !isMultiRsp) { | 762 | !isMultiRsp) { |
649 | cERROR(1, "No task to wake, unknown frame received! " | 763 | cERROR(1, "No task to wake, unknown frame received! " |
650 | "NumMids %d", atomic_read(&midCount)); | 764 | "NumMids %d", atomic_read(&midCount)); |
651 | cifs_dump_mem("Received Data is: ", (char *)smb_buffer, | 765 | cifs_dump_mem("Received Data is: ", buf, |
652 | sizeof(struct smb_hdr)); | 766 | sizeof(struct smb_hdr)); |
653 | #ifdef CONFIG_CIFS_DEBUG2 | 767 | #ifdef CONFIG_CIFS_DEBUG2 |
654 | cifs_dump_detail(smb_buffer); | 768 | cifs_dump_detail(smb_buffer); |
@@ -658,88 +772,13 @@ multi_t2_fnd: | |||
658 | } | 772 | } |
659 | } /* end while !EXITING */ | 773 | } /* end while !EXITING */ |
660 | 774 | ||
661 | /* take it off the list, if it's not already */ | ||
662 | spin_lock(&cifs_tcp_ses_lock); | ||
663 | list_del_init(&server->tcp_ses_list); | ||
664 | spin_unlock(&cifs_tcp_ses_lock); | ||
665 | |||
666 | spin_lock(&GlobalMid_Lock); | ||
667 | server->tcpStatus = CifsExiting; | ||
668 | spin_unlock(&GlobalMid_Lock); | ||
669 | wake_up_all(&server->response_q); | ||
670 | |||
671 | /* check if we have blocked requests that need to be freed */ | ||
672 | /* Note that cifs_max_pending is normally 50, but | ||
673 | can be set at module install time to as little as two */ | ||
674 | spin_lock(&GlobalMid_Lock); | ||
675 | if (atomic_read(&server->inFlight) >= cifs_max_pending) | ||
676 | atomic_set(&server->inFlight, cifs_max_pending - 1); | ||
677 | /* We do not want to set the max_pending too low or we | ||
678 | could end up with the counter going negative */ | ||
679 | spin_unlock(&GlobalMid_Lock); | ||
680 | /* Although there should not be any requests blocked on | ||
681 | this queue, it cannot hurt to be paranoid and try to wake up requests | ||
682 | that may have been blocked when more than 50 at a time were on the wire | ||
683 | to the same server - they will now see the session is in exit state | ||
684 | and get out of SendReceive. */ | ||
685 | wake_up_all(&server->request_q); | ||
686 | /* give those requests time to exit */ | ||
687 | msleep(125); | ||
688 | |||
689 | if (server->ssocket) { | ||
690 | sock_release(csocket); | ||
691 | server->ssocket = NULL; | ||
692 | } | ||
693 | /* buffer usually freed in free_mid - need to free it here on exit */ | 775 | /* buffer usually freed in free_mid - need to free it here on exit */ |
694 | cifs_buf_release(bigbuf); | 776 | cifs_buf_release(bigbuf); |
695 | if (smallbuf) /* no sense logging a debug message if NULL */ | 777 | if (smallbuf) /* no sense logging a debug message if NULL */ |
696 | cifs_small_buf_release(smallbuf); | 778 | cifs_small_buf_release(smallbuf); |
697 | 779 | ||
698 | if (!list_empty(&server->pending_mid_q)) { | ||
699 | struct list_head dispose_list; | ||
700 | |||
701 | INIT_LIST_HEAD(&dispose_list); | ||
702 | spin_lock(&GlobalMid_Lock); | ||
703 | list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { | ||
704 | mid_entry = list_entry(tmp, struct mid_q_entry, qhead); | ||
705 | cFYI(1, "Clearing mid 0x%x", mid_entry->mid); | ||
706 | mid_entry->midState = MID_SHUTDOWN; | ||
707 | list_move(&mid_entry->qhead, &dispose_list); | ||
708 | } | ||
709 | spin_unlock(&GlobalMid_Lock); | ||
710 | |||
711 | /* now walk dispose list and issue callbacks */ | ||
712 | list_for_each_safe(tmp, tmp2, &dispose_list) { | ||
713 | mid_entry = list_entry(tmp, struct mid_q_entry, qhead); | ||
714 | cFYI(1, "Callback mid 0x%x", mid_entry->mid); | ||
715 | list_del_init(&mid_entry->qhead); | ||
716 | mid_entry->callback(mid_entry); | ||
717 | } | ||
718 | /* 1/8th of sec is more than enough time for them to exit */ | ||
719 | msleep(125); | ||
720 | } | ||
721 | |||
722 | if (!list_empty(&server->pending_mid_q)) { | ||
723 | /* mpx threads have not exited yet; give them | ||
724 | at least the smb send timeout for long ops */ | ||
725 | /* due to delays on oplock break requests, we need | ||
726 | to wait at least 45 seconds before giving up | ||
727 | on a request getting a response and going ahead | ||
728 | and killing cifsd */ | ||
729 | cFYI(1, "Wait for exit from demultiplex thread"); | ||
730 | msleep(46000); | ||
731 | /* if threads still have not exited they are probably never | ||
732 | coming home; not much else we can do but free the memory */ | ||
733 | } | ||
734 | |||
735 | kfree(server->hostname); | ||
736 | task_to_wake = xchg(&server->tsk, NULL); | 780 | task_to_wake = xchg(&server->tsk, NULL); |
737 | kfree(server); | 781 | clean_demultiplex_info(server); |
738 | |||
739 | length = atomic_dec_return(&tcpSesAllocCount); | ||
740 | if (length > 0) | ||
741 | mempool_resize(cifs_req_poolp, length + cifs_min_rcv, | ||
742 | GFP_KERNEL); | ||
743 | 782 | ||
744 | /* if server->tsk was NULL then wait for a signal before exiting */ | 783 | /* if server->tsk was NULL then wait for a signal before exiting */ |
745 | if (!task_to_wake) { | 784 | if (!task_to_wake) { |
@@ -3193,15 +3232,9 @@ mount_fail_check: | |||
3193 | else | 3232 | else |
3194 | cifs_put_tcp_session(srvTcp); | 3233 | cifs_put_tcp_session(srvTcp); |
3195 | bdi_destroy(&cifs_sb->bdi); | 3234 | bdi_destroy(&cifs_sb->bdi); |
3196 | goto out; | ||
3197 | } | 3235 | } |
3198 | 3236 | ||
3199 | /* volume_info->password is freed above when existing session found | ||
3200 | (in which case it is not needed anymore) but when a new session is created | ||
3201 | the password ptr is put in the new session structure (in which case the | ||
3202 | password will be freed at unmount time) */ | ||
3203 | out: | 3237 | out: |
3204 | /* zero out password before freeing */ | ||
3205 | FreeXid(xid); | 3238 | FreeXid(xid); |
3206 | return rc; | 3239 | return rc; |
3207 | } | 3240 | } |
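The thread's shutdown sequence moves into clean_demultiplex_info(). Reassembled from the deleted lines above, its shape is roughly the following; the pending-mid disposal walk and the 45-second straggler wait are abbreviated to a comment, so this is a sketch rather than the verbatim helper.

static void clean_demultiplex_info(struct TCP_Server_Info *server)
{
	int length;

	/* take it off the list, if it's not already */
	spin_lock(&cifs_tcp_ses_lock);
	list_del_init(&server->tcp_ses_list);
	spin_unlock(&cifs_tcp_ses_lock);

	spin_lock(&GlobalMid_Lock);
	server->tcpStatus = CifsExiting;
	spin_unlock(&GlobalMid_Lock);
	wake_up_all(&server->response_q);

	/* keep inFlight below cifs_max_pending so senders can drain */
	spin_lock(&GlobalMid_Lock);
	if (atomic_read(&server->inFlight) >= cifs_max_pending)
		atomic_set(&server->inFlight, cifs_max_pending - 1);
	spin_unlock(&GlobalMid_Lock);
	wake_up_all(&server->request_q);
	msleep(125);	/* give blocked requests time to exit */

	if (server->ssocket) {
		sock_release(server->ssocket);
		server->ssocket = NULL;
	}

	/*
	 * ... walk pending_mid_q under GlobalMid_Lock, mark each entry
	 * MID_SHUTDOWN, move it to a private list, then issue the
	 * callbacks and wait for stragglers, as in the deleted code ...
	 */

	kfree(server->hostname);
	kfree(server);

	length = atomic_dec_return(&tcpSesAllocCount);
	if (length > 0)
		mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
			       GFP_KERNEL);
}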
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 548f06230a6d..1d2d91d9bf65 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c | |||
@@ -79,8 +79,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) | |||
79 | /* Perform the upcall */ | 79 | /* Perform the upcall */ |
80 | rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); | 80 | rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); |
81 | if (rc < 0) | 81 | if (rc < 0) |
82 | cERROR(1, "%s: unable to resolve: %*.*s", | 82 | cFYI(1, "%s: unable to resolve: %*.*s", |
83 | __func__, len, len, hostname); | 83 | __func__, len, len, hostname); |
84 | else | 84 | else |
85 | cFYI(1, "%s: resolved: %*.*s to %s", | 85 | cFYI(1, "%s: resolved: %*.*s to %s", |
86 | __func__, len, len, hostname, *ip_addr); | 86 | __func__, len, len, hostname, *ip_addr); |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 378acdafa356..9f41a10523a1 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -314,6 +314,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) | |||
314 | } | 314 | } |
315 | spin_unlock(&cifs_file_list_lock); | 315 | spin_unlock(&cifs_file_list_lock); |
316 | 316 | ||
317 | cancel_work_sync(&cifs_file->oplock_break); | ||
318 | |||
317 | if (!tcon->need_reconnect && !cifs_file->invalidHandle) { | 319 | if (!tcon->need_reconnect && !cifs_file->invalidHandle) { |
318 | int xid, rc; | 320 | int xid, rc; |
319 | 321 | ||
@@ -2418,31 +2420,6 @@ void cifs_oplock_break(struct work_struct *work) | |||
2418 | cinode->clientCanCacheRead ? 1 : 0); | 2420 | cinode->clientCanCacheRead ? 1 : 0); |
2419 | cFYI(1, "Oplock release rc = %d", rc); | 2421 | cFYI(1, "Oplock release rc = %d", rc); |
2420 | } | 2422 | } |
2421 | |||
2422 | /* | ||
2423 | * We might have kicked in before is_valid_oplock_break() | ||
2424 | * finished grabbing reference for us. Make sure it's done by | ||
2425 | * waiting for cifs_file_list_lock. | ||
2426 | */ | ||
2427 | spin_lock(&cifs_file_list_lock); | ||
2428 | spin_unlock(&cifs_file_list_lock); | ||
2429 | |||
2430 | cifs_oplock_break_put(cfile); | ||
2431 | } | ||
2432 | |||
2433 | /* must be called while holding cifs_file_list_lock */ | ||
2434 | void cifs_oplock_break_get(struct cifsFileInfo *cfile) | ||
2435 | { | ||
2436 | cifs_sb_active(cfile->dentry->d_sb); | ||
2437 | cifsFileInfo_get(cfile); | ||
2438 | } | ||
2439 | |||
2440 | void cifs_oplock_break_put(struct cifsFileInfo *cfile) | ||
2441 | { | ||
2442 | struct super_block *sb = cfile->dentry->d_sb; | ||
2443 | |||
2444 | cifsFileInfo_put(cfile); | ||
2445 | cifs_sb_deactive(sb); | ||
2446 | } | 2423 | } |
2447 | 2424 | ||
2448 | const struct address_space_operations cifs_addr_ops = { | 2425 | const struct address_space_operations cifs_addr_ops = { |
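The cancel_work_sync() added to cifsFileInfo_put() is what makes the cifs_oplock_break_get()/cifs_oplock_break_put() pair removable: once the final reference holder synchronously cancels any queued oplock-break work, the work item can never run against a freed file and so needs no reference of its own. The general shape of that rule, with purely hypothetical names (oplock_obj and oplock_obj_release are illustrative, not CIFS symbols):

#include <linux/kref.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct oplock_obj {
	struct kref ref;
	struct work_struct work;
};

static void oplock_obj_release(struct kref *ref)
{
	struct oplock_obj *o = container_of(ref, struct oplock_obj, ref);

	/*
	 * After cancel_work_sync() returns, the work function is neither
	 * running nor pending, so the object can be freed without the
	 * work path holding its own reference.
	 */
	cancel_work_sync(&o->work);
	kfree(o);
}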
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 9b018c8334fa..a7b2dcd4a53e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
@@ -764,20 +764,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, | |||
764 | if (full_path == NULL) | 764 | if (full_path == NULL) |
765 | return full_path; | 765 | return full_path; |
766 | 766 | ||
767 | if (dfsplen) { | 767 | if (dfsplen) |
768 | strncpy(full_path, tcon->treeName, dfsplen); | 768 | strncpy(full_path, tcon->treeName, dfsplen); |
769 | /* switch slash direction in prepath depending on whether | ||
770 | * windows or posix style path names | ||
771 | */ | ||
772 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { | ||
773 | int i; | ||
774 | for (i = 0; i < dfsplen; i++) { | ||
775 | if (full_path[i] == '\\') | ||
776 | full_path[i] = '/'; | ||
777 | } | ||
778 | } | ||
779 | } | ||
780 | strncpy(full_path + dfsplen, vol->prepath, pplen); | 769 | strncpy(full_path + dfsplen, vol->prepath, pplen); |
770 | convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); | ||
781 | full_path[dfsplen + pplen] = 0; /* add trailing null */ | 771 | full_path[dfsplen + pplen] = 0; /* add trailing null */ |
782 | return full_path; | 772 | return full_path; |
783 | } | 773 | } |
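The hand-rolled slash flipping in cifs_build_path_to_root() gives way to convert_delimiter(), with CIFS_DIR_SEP(cifs_sb) choosing '/' on CIFS_MOUNT_POSIX_PATHS mounts and '\\' otherwise. For reference, the helper reads roughly as follows (paraphrased from cifsglob.h of the same era, so treat the details as approximate):

static inline void
convert_delimiter(char *path, char delim)
{
	int i;
	char old_delim;

	if (path == NULL)
		return;

	if (delim == '/')
		old_delim = '\\';
	else
		old_delim = '/';

	for (i = 0; path[i] != '\0'; i++) {
		if (path[i] == old_delim)
			path[i] = delim;
	}
}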
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 03a1f491d39b..7c1693392598 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c | |||
@@ -585,15 +585,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) | |||
585 | 585 | ||
586 | cifs_set_oplock_level(pCifsInode, | 586 | cifs_set_oplock_level(pCifsInode, |
587 | pSMB->OplockLevel ? OPLOCK_READ : 0); | 587 | pSMB->OplockLevel ? OPLOCK_READ : 0); |
588 | /* | 588 | queue_work(system_nrt_wq, |
589 | * cifs_oplock_break_put() can't be called | 589 | &netfile->oplock_break); |
590 | * from here. Get reference after queueing | ||
591 | * succeeded. cifs_oplock_break() will | ||
592 | * synchronize using cifs_file_list_lock. | ||
593 | */ | ||
594 | if (queue_work(system_nrt_wq, | ||
595 | &netfile->oplock_break)) | ||
596 | cifs_oplock_break_get(netfile); | ||
597 | netfile->oplock_break_cancelled = false; | 590 | netfile->oplock_break_cancelled = false; |
598 | 591 | ||
599 | spin_unlock(&cifs_file_list_lock); | 592 | spin_unlock(&cifs_file_list_lock); |
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 147aa22c3c3a..c1b9c4b10739 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c | |||
@@ -362,6 +362,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov, | |||
362 | mid = AllocMidQEntry(hdr, server); | 362 | mid = AllocMidQEntry(hdr, server); |
363 | if (mid == NULL) { | 363 | if (mid == NULL) { |
364 | mutex_unlock(&server->srv_mutex); | 364 | mutex_unlock(&server->srv_mutex); |
365 | atomic_dec(&server->inFlight); | ||
366 | wake_up(&server->request_q); | ||
365 | return -ENOMEM; | 367 | return -ENOMEM; |
366 | } | 368 | } |
367 | 369 | ||
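This two-line change plugs a request-slot leak: cifs_call_async() claims an inFlight slot before taking srv_mutex, so the ENOMEM path must hand the slot back and wake a waiting sender, or traffic eventually stalls at cifs_max_pending. In context, assuming the surrounding function shape of that era (wait_for_free_request() and ignore_pend are recalled from nearby code, not quoted from this diff):

	rc = wait_for_free_request(server, ignore_pend ? CIFS_ASYNC_OP : 0);
	if (rc)
		return rc;		/* no slot claimed, nothing to undo */

	mutex_lock(&server->srv_mutex);
	mid = AllocMidQEntry(hdr, server);
	if (mid == NULL) {
		mutex_unlock(&server->srv_mutex);
		/* give the claimed slot back and wake one waiting sender */
		atomic_dec(&server->inFlight);
		wake_up(&server->request_q);
		return -ENOMEM;
	}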
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 8be086e9abe4..51352de88ef1 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c | |||
@@ -1003,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT) | |||
1003 | COMPATIBLE_IOCTL(PPPIOCDISCONN) | 1003 | COMPATIBLE_IOCTL(PPPIOCDISCONN) |
1004 | COMPATIBLE_IOCTL(PPPIOCATTCHAN) | 1004 | COMPATIBLE_IOCTL(PPPIOCATTCHAN) |
1005 | COMPATIBLE_IOCTL(PPPIOCGCHAN) | 1005 | COMPATIBLE_IOCTL(PPPIOCGCHAN) |
1006 | COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS) | ||
1006 | /* PPPOX */ | 1007 | /* PPPOX */ |
1007 | COMPATIBLE_IOCTL(PPPOEIOCSFWD) | 1008 | COMPATIBLE_IOCTL(PPPOEIOCSFWD) |
1008 | COMPATIBLE_IOCTL(PPPOEIOCDFWD) | 1009 | COMPATIBLE_IOCTL(PPPOEIOCDFWD) |
diff --git a/fs/dcache.c b/fs/dcache.c index b05aac3a8cfc..a88948b8bd17 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -301,6 +301,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) | |||
301 | return parent; | 301 | return parent; |
302 | } | 302 | } |
303 | 303 | ||
304 | /* | ||
305 | * Unhash a dentry without inserting an RCU walk barrier or checking that | ||
306 | * dentry->d_lock is locked. The caller must take care of that, if | ||
307 | * appropriate. | ||
308 | */ | ||
309 | static void __d_shrink(struct dentry *dentry) | ||
310 | { | ||
311 | if (!d_unhashed(dentry)) { | ||
312 | struct hlist_bl_head *b; | ||
313 | if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) | ||
314 | b = &dentry->d_sb->s_anon; | ||
315 | else | ||
316 | b = d_hash(dentry->d_parent, dentry->d_name.hash); | ||
317 | |||
318 | hlist_bl_lock(b); | ||
319 | __hlist_bl_del(&dentry->d_hash); | ||
320 | dentry->d_hash.pprev = NULL; | ||
321 | hlist_bl_unlock(b); | ||
322 | } | ||
323 | } | ||
324 | |||
304 | /** | 325 | /** |
305 | * d_drop - drop a dentry | 326 | * d_drop - drop a dentry |
306 | * @dentry: dentry to drop | 327 | * @dentry: dentry to drop |
@@ -319,17 +340,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) | |||
319 | void __d_drop(struct dentry *dentry) | 340 | void __d_drop(struct dentry *dentry) |
320 | { | 341 | { |
321 | if (!d_unhashed(dentry)) { | 342 | if (!d_unhashed(dentry)) { |
322 | struct hlist_bl_head *b; | 343 | __d_shrink(dentry); |
323 | if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) | ||
324 | b = &dentry->d_sb->s_anon; | ||
325 | else | ||
326 | b = d_hash(dentry->d_parent, dentry->d_name.hash); | ||
327 | |||
328 | hlist_bl_lock(b); | ||
329 | __hlist_bl_del(&dentry->d_hash); | ||
330 | dentry->d_hash.pprev = NULL; | ||
331 | hlist_bl_unlock(b); | ||
332 | |||
333 | dentry_rcuwalk_barrier(dentry); | 344 | dentry_rcuwalk_barrier(dentry); |
334 | } | 345 | } |
335 | } | 346 | } |
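__d_shrink() exists so that the umount path can unhash dentries cheaply: with the filesystem going away there are no concurrent RCU-walk lookups to invalidate, whereas a normal __d_drop() must still bump d_seq so lockless walkers retry. Illustrative only (not upstream code), the reader-side pattern that the barrier trips looks like:

	unsigned seq;

	seq = read_seqcount_begin(&dentry->d_seq);
	/* ... lockless use of the dentry's fields ... */
	if (read_seqcount_retry(&dentry->d_seq, seq))
		goto drop_rcu_walk;	/* a writer such as __d_drop() intervened */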
@@ -784,6 +795,7 @@ relock: | |||
784 | 795 | ||
785 | /** | 796 | /** |
786 | * prune_dcache_sb - shrink the dcache | 797 | * prune_dcache_sb - shrink the dcache |
798 | * @sb: superblock | ||
787 | * @nr_to_scan: number of entries to try to free | 799 | * @nr_to_scan: number of entries to try to free |
788 | * | 800 | * |
789 | * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is | 801 | * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is |
@@ -828,44 +840,24 @@ EXPORT_SYMBOL(shrink_dcache_sb); | |||
828 | static void shrink_dcache_for_umount_subtree(struct dentry *dentry) | 840 | static void shrink_dcache_for_umount_subtree(struct dentry *dentry) |
829 | { | 841 | { |
830 | struct dentry *parent; | 842 | struct dentry *parent; |
831 | unsigned detached = 0; | ||
832 | 843 | ||
833 | BUG_ON(!IS_ROOT(dentry)); | 844 | BUG_ON(!IS_ROOT(dentry)); |
834 | 845 | ||
835 | /* detach this root from the system */ | ||
836 | spin_lock(&dentry->d_lock); | ||
837 | dentry_lru_del(dentry); | ||
838 | __d_drop(dentry); | ||
839 | spin_unlock(&dentry->d_lock); | ||
840 | |||
841 | for (;;) { | 846 | for (;;) { |
842 | /* descend to the first leaf in the current subtree */ | 847 | /* descend to the first leaf in the current subtree */ |
843 | while (!list_empty(&dentry->d_subdirs)) { | 848 | while (!list_empty(&dentry->d_subdirs)) |
844 | struct dentry *loop; | ||
845 | |||
846 | /* this is a branch with children - detach all of them | ||
847 | * from the system in one go */ | ||
848 | spin_lock(&dentry->d_lock); | ||
849 | list_for_each_entry(loop, &dentry->d_subdirs, | ||
850 | d_u.d_child) { | ||
851 | spin_lock_nested(&loop->d_lock, | ||
852 | DENTRY_D_LOCK_NESTED); | ||
853 | dentry_lru_del(loop); | ||
854 | __d_drop(loop); | ||
855 | spin_unlock(&loop->d_lock); | ||
856 | } | ||
857 | spin_unlock(&dentry->d_lock); | ||
858 | |||
859 | /* move to the first child */ | ||
860 | dentry = list_entry(dentry->d_subdirs.next, | 849 | dentry = list_entry(dentry->d_subdirs.next, |
861 | struct dentry, d_u.d_child); | 850 | struct dentry, d_u.d_child); |
862 | } | ||
863 | 851 | ||
864 | /* consume the dentries from this leaf up through its parents | 852 | /* consume the dentries from this leaf up through its parents |
865 | * until we find one with children or run out altogether */ | 853 | * until we find one with children or run out altogether */ |
866 | do { | 854 | do { |
867 | struct inode *inode; | 855 | struct inode *inode; |
868 | 856 | ||
857 | /* detach from the system */ | ||
858 | dentry_lru_del(dentry); | ||
859 | __d_shrink(dentry); | ||
860 | |||
869 | if (dentry->d_count != 0) { | 861 | if (dentry->d_count != 0) { |
870 | printk(KERN_ERR | 862 | printk(KERN_ERR |
871 | "BUG: Dentry %p{i=%lx,n=%s}" | 863 | "BUG: Dentry %p{i=%lx,n=%s}" |
@@ -886,14 +878,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry) | |||
886 | list_del(&dentry->d_u.d_child); | 878 | list_del(&dentry->d_u.d_child); |
887 | } else { | 879 | } else { |
888 | parent = dentry->d_parent; | 880 | parent = dentry->d_parent; |
889 | spin_lock(&parent->d_lock); | ||
890 | parent->d_count--; | 881 | parent->d_count--; |
891 | list_del(&dentry->d_u.d_child); | 882 | list_del(&dentry->d_u.d_child); |
892 | spin_unlock(&parent->d_lock); | ||
893 | } | 883 | } |
894 | 884 | ||
895 | detached++; | ||
896 | |||
897 | inode = dentry->d_inode; | 885 | inode = dentry->d_inode; |
898 | if (inode) { | 886 | if (inode) { |
899 | dentry->d_inode = NULL; | 887 | dentry->d_inode = NULL; |
@@ -938,9 +926,7 @@ void shrink_dcache_for_umount(struct super_block *sb) | |||
938 | 926 | ||
939 | dentry = sb->s_root; | 927 | dentry = sb->s_root; |
940 | sb->s_root = NULL; | 928 | sb->s_root = NULL; |
941 | spin_lock(&dentry->d_lock); | ||
942 | dentry->d_count--; | 929 | dentry->d_count--; |
943 | spin_unlock(&dentry->d_lock); | ||
944 | shrink_dcache_for_umount_subtree(dentry); | 930 | shrink_dcache_for_umount_subtree(dentry); |
945 | 931 | ||
946 | while (!hlist_bl_empty(&sb->s_anon)) { | 932 | while (!hlist_bl_empty(&sb->s_anon)) { |
@@ -1743,7 +1729,7 @@ seqretry: | |||
1743 | */ | 1729 | */ |
1744 | if (read_seqcount_retry(&dentry->d_seq, *seq)) | 1730 | if (read_seqcount_retry(&dentry->d_seq, *seq)) |
1745 | goto seqretry; | 1731 | goto seqretry; |
1746 | if (parent->d_flags & DCACHE_OP_COMPARE) { | 1732 | if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { |
1747 | if (parent->d_op->d_compare(parent, *inode, | 1733 | if (parent->d_op->d_compare(parent, *inode, |
1748 | dentry, i, | 1734 | dentry, i, |
1749 | tlen, tname, name)) | 1735 | tlen, tname, name)) |
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 2d0f757fda3e..c5a5855a6c44 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild | |||
@@ -12,5 +12,8 @@ | |||
12 | # Kbuild - Gets included from the Kernels Makefile and build system | 12 | # Kbuild - Gets included from the Kernels Makefile and build system |
13 | # | 13 | # |
14 | 14 | ||
15 | exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o | 15 | # ore module library |
16 | obj-$(CONFIG_ORE) += ore.o | ||
17 | |||
18 | exofs-y := inode.o file.o symlink.o namei.o dir.o super.o | ||
16 | obj-$(CONFIG_EXOFS_FS) += exofs.o | 19 | obj-$(CONFIG_EXOFS_FS) += exofs.o |
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig index 86194b2f799d..70bae4149291 100644 --- a/fs/exofs/Kconfig +++ b/fs/exofs/Kconfig | |||
@@ -1,6 +1,10 @@ | |||
1 | config ORE | ||
2 | tristate | ||
3 | |||
1 | config EXOFS_FS | 4 | config EXOFS_FS |
2 | tristate "exofs: OSD based file system support" | 5 | tristate "exofs: OSD based file system support" |
3 | depends on SCSI_OSD_ULD | 6 | depends on SCSI_OSD_ULD |
7 | select ORE | ||
4 | help | 8 | help |
5 | EXOFS is a file system that uses an OSD storage device, | 9 | EXOFS is a file system that uses an OSD storage device, |
6 | as its backing storage. | 10 | as its backing storage. |
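These build changes carve the object-raid engine out of exofs into a standalone ORE library module that EXOFS_FS now selects. The exofs diffs that follow convert every exofs_io_state user to the ORE API; the basic call pattern becomes roughly the following (abbreviated sketch; my_done_callback and my_cookie are hypothetical, and error handling is elided):

	struct ore_io_state *ios;
	int ret;

	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
	if (unlikely(ret))
		return ret;

	/* attach pages or attributes exactly as before */
	ios->done = my_done_callback;	/* async completion, optional */
	ios->private = my_cookie;

	ret = ore_read(ios);		/* or ore_write() / ore_create() */
	if (unlikely(ret))
		ore_put_io_state(ios);	/* async done() releases it otherwise */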
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index c965806c2821..f4e442ec7445 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h | |||
@@ -36,12 +36,9 @@ | |||
36 | #include <linux/fs.h> | 36 | #include <linux/fs.h> |
37 | #include <linux/time.h> | 37 | #include <linux/time.h> |
38 | #include <linux/backing-dev.h> | 38 | #include <linux/backing-dev.h> |
39 | #include "common.h" | 39 | #include <scsi/osd_ore.h> |
40 | 40 | ||
41 | /* FIXME: Remove once pnfs hits mainline | 41 | #include "common.h" |
42 | * #include <linux/exportfs/pnfs_osd_xdr.h> | ||
43 | */ | ||
44 | #include "pnfs.h" | ||
45 | 42 | ||
46 | #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) | 43 | #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) |
47 | 44 | ||
@@ -56,27 +53,11 @@ | |||
56 | /* u64 has problems with printk this will cast it to unsigned long long */ | 53 | /* u64 has problems with printk this will cast it to unsigned long long */ |
57 | #define _LLU(x) (unsigned long long)(x) | 54 | #define _LLU(x) (unsigned long long)(x) |
58 | 55 | ||
59 | struct exofs_layout { | ||
60 | osd_id s_pid; /* partition ID of file system*/ | ||
61 | |||
62 | /* Our way of looking at the data_map */ | ||
63 | unsigned stripe_unit; | ||
64 | unsigned mirrors_p1; | ||
65 | |||
66 | unsigned group_width; | ||
67 | u64 group_depth; | ||
68 | unsigned group_count; | ||
69 | |||
70 | enum exofs_inode_layout_gen_functions lay_func; | ||
71 | |||
72 | unsigned s_numdevs; /* Num of devices in array */ | ||
73 | struct osd_dev *s_ods[0]; /* Variable length */ | ||
74 | }; | ||
75 | |||
76 | /* | 56 | /* |
77 | * our extension to the in-memory superblock | 57 | * our extension to the in-memory superblock |
78 | */ | 58 | */ |
79 | struct exofs_sb_info { | 59 | struct exofs_sb_info { |
60 | struct backing_dev_info bdi; /* register our bdi with VFS */ | ||
80 | struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ | 61 | struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ |
81 | int s_timeout; /* timeout for OSD operations */ | 62 | int s_timeout; /* timeout for OSD operations */ |
82 | uint64_t s_nextid; /* highest object ID used */ | 63 | uint64_t s_nextid; /* highest object ID used */ |
@@ -84,16 +65,13 @@ struct exofs_sb_info { | |||
84 | spinlock_t s_next_gen_lock; /* spinlock for gen # update */ | 65 | spinlock_t s_next_gen_lock; /* spinlock for gen # update */ |
85 | u32 s_next_generation; /* next gen # to use */ | 66 | u32 s_next_generation; /* next gen # to use */ |
86 | atomic_t s_curr_pending; /* number of pending commands */ | 67 | atomic_t s_curr_pending; /* number of pending commands */ |
87 | uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ | ||
88 | struct backing_dev_info bdi; /* register our bdi with VFS */ | ||
89 | 68 | ||
90 | struct pnfs_osd_data_map data_map; /* Default raid to use | 69 | struct pnfs_osd_data_map data_map; /* Default raid to use |
91 | * FIXME: Needed ? | 70 | * FIXME: Needed ? |
92 | */ | 71 | */ |
93 | /* struct exofs_layout dir_layout;*/ /* Default dir layout */ | 72 | struct ore_layout layout; /* Default files layout */ |
94 | struct exofs_layout layout; /* Default files layout, | 73 | struct ore_comp one_comp; /* id & cred of partition id=0*/ |
95 | * contains the variable osd_dev | 74 | struct ore_components comps; /* comps for the partition */ |
96 | * array. Keep last */ | ||
97 | struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ | 75 | struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ |
98 | }; | 76 | }; |
99 | 77 | ||
@@ -107,7 +85,8 @@ struct exofs_i_info { | |||
107 | uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ | 85 | uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ |
108 | uint32_t i_dir_start_lookup; /* which page to start lookup */ | 86 | uint32_t i_dir_start_lookup; /* which page to start lookup */ |
109 | uint64_t i_commit_size; /* the object's written length */ | 87 | uint64_t i_commit_size; /* the object's written length */ |
110 | uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ | 88 | struct ore_comp one_comp; /* same component for all devices */ |
89 | struct ore_components comps; /* inode view of the device table */ | ||
111 | }; | 90 | }; |
112 | 91 | ||
113 | static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) | 92 | static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) |
@@ -115,52 +94,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) | |||
115 | return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; | 94 | return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; |
116 | } | 95 | } |
117 | 96 | ||
118 | struct exofs_io_state; | ||
119 | typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); | ||
120 | |||
121 | struct exofs_io_state { | ||
122 | struct kref kref; | ||
123 | |||
124 | void *private; | ||
125 | exofs_io_done_fn done; | ||
126 | |||
127 | struct exofs_layout *layout; | ||
128 | struct osd_obj_id obj; | ||
129 | u8 *cred; | ||
130 | |||
131 | /* Global read/write IO*/ | ||
132 | loff_t offset; | ||
133 | unsigned long length; | ||
134 | void *kern_buff; | ||
135 | |||
136 | struct page **pages; | ||
137 | unsigned nr_pages; | ||
138 | unsigned pgbase; | ||
139 | unsigned pages_consumed; | ||
140 | |||
141 | /* Attributes */ | ||
142 | unsigned in_attr_len; | ||
143 | struct osd_attr *in_attr; | ||
144 | unsigned out_attr_len; | ||
145 | struct osd_attr *out_attr; | ||
146 | |||
147 | /* Variable array of size numdevs */ | ||
148 | unsigned numdevs; | ||
149 | struct exofs_per_dev_state { | ||
150 | struct osd_request *or; | ||
151 | struct bio *bio; | ||
152 | loff_t offset; | ||
153 | unsigned length; | ||
154 | unsigned dev; | ||
155 | } per_dev[]; | ||
156 | }; | ||
157 | |||
158 | static inline unsigned exofs_io_state_size(unsigned numdevs) | ||
159 | { | ||
160 | return sizeof(struct exofs_io_state) + | ||
161 | sizeof(struct exofs_per_dev_state) * numdevs; | ||
162 | } | ||
163 | |||
164 | /* | 97 | /* |
165 | * our inode flags | 98 | * our inode flags |
166 | */ | 99 | */ |
@@ -205,12 +138,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) | |||
205 | } | 138 | } |
206 | 139 | ||
207 | /* | 140 | /* |
208 | * Given a layout, object_number and stripe_index return the associated global | ||
209 | * dev_index | ||
210 | */ | ||
211 | unsigned exofs_layout_od_id(struct exofs_layout *layout, | ||
212 | osd_id obj_no, unsigned layout_index); | ||
213 | /* | ||
214 | * Maximum count of links to a file | 141 | * Maximum count of links to a file |
215 | */ | 142 | */ |
216 | #define EXOFS_LINK_MAX 32000 | 143 | #define EXOFS_LINK_MAX 32000 |
@@ -219,44 +146,8 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout, | |||
219 | * function declarations * | 146 | * function declarations * |
220 | *************************/ | 147 | *************************/ |
221 | 148 | ||
222 | /* ios.c */ | ||
223 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], | ||
224 | const struct osd_obj_id *obj); | ||
225 | int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, | ||
226 | u64 offset, void *p, unsigned length); | ||
227 | |||
228 | int exofs_get_io_state(struct exofs_layout *layout, | ||
229 | struct exofs_io_state **ios); | ||
230 | void exofs_put_io_state(struct exofs_io_state *ios); | ||
231 | |||
232 | int exofs_check_io(struct exofs_io_state *ios, u64 *resid); | ||
233 | |||
234 | int exofs_sbi_create(struct exofs_io_state *ios); | ||
235 | int exofs_sbi_remove(struct exofs_io_state *ios); | ||
236 | int exofs_sbi_write(struct exofs_io_state *ios); | ||
237 | int exofs_sbi_read(struct exofs_io_state *ios); | ||
238 | |||
239 | int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); | ||
240 | |||
241 | int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); | ||
242 | static inline int exofs_oi_write(struct exofs_i_info *oi, | ||
243 | struct exofs_io_state *ios) | ||
244 | { | ||
245 | ios->obj.id = exofs_oi_objno(oi); | ||
246 | ios->cred = oi->i_cred; | ||
247 | return exofs_sbi_write(ios); | ||
248 | } | ||
249 | |||
250 | static inline int exofs_oi_read(struct exofs_i_info *oi, | ||
251 | struct exofs_io_state *ios) | ||
252 | { | ||
253 | ios->obj.id = exofs_oi_objno(oi); | ||
254 | ios->cred = oi->i_cred; | ||
255 | return exofs_sbi_read(ios); | ||
256 | } | ||
257 | |||
258 | /* inode.c */ | 149 | /* inode.c */ |
259 | unsigned exofs_max_io_pages(struct exofs_layout *layout, | 150 | unsigned exofs_max_io_pages(struct ore_layout *layout, |
260 | unsigned expected_pages); | 151 | unsigned expected_pages); |
261 | int exofs_setattr(struct dentry *, struct iattr *); | 152 | int exofs_setattr(struct dentry *, struct iattr *); |
262 | int exofs_write_begin(struct file *file, struct address_space *mapping, | 153 | int exofs_write_begin(struct file *file, struct address_space *mapping, |
@@ -281,6 +172,8 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, | |||
281 | struct inode *); | 172 | struct inode *); |
282 | 173 | ||
283 | /* super.c */ | 174 | /* super.c */ |
175 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], | ||
176 | const struct osd_obj_id *obj); | ||
284 | int exofs_sbi_write_stats(struct exofs_sb_info *sbi); | 177 | int exofs_sbi_write_stats(struct exofs_sb_info *sbi); |
285 | 178 | ||
286 | /********************* | 179 | /********************* |
@@ -295,7 +188,6 @@ extern const struct file_operations exofs_file_operations; | |||
295 | 188 | ||
296 | /* inode.c */ | 189 | /* inode.c */ |
297 | extern const struct address_space_operations exofs_aops; | 190 | extern const struct address_space_operations exofs_aops; |
298 | extern const struct osd_attr g_attr_logical_length; | ||
299 | 191 | ||
300 | /* namei.c */ | 192 | /* namei.c */ |
301 | extern const struct inode_operations exofs_dir_inode_operations; | 193 | extern const struct inode_operations exofs_dir_inode_operations; |
@@ -305,4 +197,33 @@ extern const struct inode_operations exofs_special_inode_operations; | |||
305 | extern const struct inode_operations exofs_symlink_inode_operations; | 197 | extern const struct inode_operations exofs_symlink_inode_operations; |
306 | extern const struct inode_operations exofs_fast_symlink_inode_operations; | 198 | extern const struct inode_operations exofs_fast_symlink_inode_operations; |
307 | 199 | ||
200 | /* exofs_init_comps will initialize an ore_components device array | ||
201 | * pointing to a single ore_comp struct, and a round-robin view | ||
202 | * of the device table. | ||
203 | * The first device of each inode is the [inode->ino % num_devices] | ||
204 | * and the rest of the devices follow sequentially, wrapping | ||
205 | * around after the last device in the table. | ||
206 | * It is assumed that the global device array at @sbi is twice | ||
207 | * as big and that the device table repeats twice. | ||
208 | * See: exofs_read_lookup_dev_table() | ||
209 | */ | ||
210 | static inline void exofs_init_comps(struct ore_components *comps, | ||
211 | struct ore_comp *one_comp, | ||
212 | struct exofs_sb_info *sbi, osd_id oid) | ||
213 | { | ||
214 | unsigned dev_mod = (unsigned)oid, first_dev; | ||
215 | |||
216 | one_comp->obj.partition = sbi->one_comp.obj.partition; | ||
217 | one_comp->obj.id = oid; | ||
218 | exofs_make_credential(one_comp->cred, &one_comp->obj); | ||
219 | |||
220 | comps->numdevs = sbi->comps.numdevs; | ||
221 | comps->single_comp = EC_SINGLE_COMP; | ||
222 | comps->comps = one_comp; | ||
223 | |||
224 | /* Round robin device view of the table */ | ||
225 | first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs; | ||
226 | comps->ods = sbi->comps.ods + first_dev; | ||
227 | } | ||
228 | |||
308 | #endif | 229 | #endif |
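A worked example may make the round-robin comment above concrete; every number here is made up. Take numdevs = 6, mirrors_p1 = 2, and an inode whose object id is 7:

	unsigned numdevs = 6, mirrors_p1 = 2;	/* hypothetical layout */
	unsigned dev_mod = 7;			/* (unsigned)oid for oid == 7 */
	unsigned first_dev = (dev_mod * mirrors_p1) % numdevs;	/* 14 % 6 == 2 */

	/*
	 * comps->ods then points at sbi->comps.ods + 2: the inode's view
	 * starts at global device 2 and runs sequentially. Because the
	 * device table is stored twice (see exofs_read_lookup_dev_table()),
	 * indexing ods[0..numdevs-1] never runs past the array end.
	 */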
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 8472c098445d..f39a38fc2349 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c | |||
@@ -43,7 +43,7 @@ enum { BIO_MAX_PAGES_KMALLOC = | |||
43 | PAGE_SIZE / sizeof(struct page *), | 43 | PAGE_SIZE / sizeof(struct page *), |
44 | }; | 44 | }; |
45 | 45 | ||
46 | unsigned exofs_max_io_pages(struct exofs_layout *layout, | 46 | unsigned exofs_max_io_pages(struct ore_layout *layout, |
47 | unsigned expected_pages) | 47 | unsigned expected_pages) |
48 | { | 48 | { |
49 | unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); | 49 | unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); |
@@ -58,7 +58,7 @@ struct page_collect { | |||
58 | struct exofs_sb_info *sbi; | 58 | struct exofs_sb_info *sbi; |
59 | struct inode *inode; | 59 | struct inode *inode; |
60 | unsigned expected_pages; | 60 | unsigned expected_pages; |
61 | struct exofs_io_state *ios; | 61 | struct ore_io_state *ios; |
62 | 62 | ||
63 | struct page **pages; | 63 | struct page **pages; |
64 | unsigned alloc_pages; | 64 | unsigned alloc_pages; |
@@ -110,13 +110,6 @@ static int pcol_try_alloc(struct page_collect *pcol) | |||
110 | { | 110 | { |
111 | unsigned pages; | 111 | unsigned pages; |
112 | 112 | ||
113 | if (!pcol->ios) { /* First time allocate io_state */ | ||
114 | int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); | ||
115 | |||
116 | if (ret) | ||
117 | return ret; | ||
118 | } | ||
119 | |||
120 | /* TODO: easily support bio chaining */ | 113 | /* TODO: easily support bio chaining */ |
121 | pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); | 114 | pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); |
122 | 115 | ||
@@ -140,7 +133,7 @@ static void pcol_free(struct page_collect *pcol) | |||
140 | pcol->pages = NULL; | 133 | pcol->pages = NULL; |
141 | 134 | ||
142 | if (pcol->ios) { | 135 | if (pcol->ios) { |
143 | exofs_put_io_state(pcol->ios); | 136 | ore_put_io_state(pcol->ios); |
144 | pcol->ios = NULL; | 137 | pcol->ios = NULL; |
145 | } | 138 | } |
146 | } | 139 | } |
@@ -200,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol) | |||
200 | u64 resid; | 193 | u64 resid; |
201 | u64 good_bytes; | 194 | u64 good_bytes; |
202 | u64 length = 0; | 195 | u64 length = 0; |
203 | int ret = exofs_check_io(pcol->ios, &resid); | 196 | int ret = ore_check_io(pcol->ios, &resid); |
204 | 197 | ||
205 | if (likely(!ret)) | 198 | if (likely(!ret)) |
206 | good_bytes = pcol->length; | 199 | good_bytes = pcol->length; |
@@ -241,7 +234,7 @@ static int __readpages_done(struct page_collect *pcol) | |||
241 | } | 234 | } |
242 | 235 | ||
243 | /* callback of async reads */ | 236 | /* callback of async reads */ |
244 | static void readpages_done(struct exofs_io_state *ios, void *p) | 237 | static void readpages_done(struct ore_io_state *ios, void *p) |
245 | { | 238 | { |
246 | struct page_collect *pcol = p; | 239 | struct page_collect *pcol = p; |
247 | 240 | ||
@@ -269,20 +262,28 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) | |||
269 | static int read_exec(struct page_collect *pcol) | 262 | static int read_exec(struct page_collect *pcol) |
270 | { | 263 | { |
271 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 264 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
272 | struct exofs_io_state *ios = pcol->ios; | 265 | struct ore_io_state *ios; |
273 | struct page_collect *pcol_copy = NULL; | 266 | struct page_collect *pcol_copy = NULL; |
274 | int ret; | 267 | int ret; |
275 | 268 | ||
276 | if (!pcol->pages) | 269 | if (!pcol->pages) |
277 | return 0; | 270 | return 0; |
278 | 271 | ||
272 | if (!pcol->ios) { | ||
273 | int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true, | ||
274 | pcol->pg_first << PAGE_CACHE_SHIFT, | ||
275 | pcol->length, &pcol->ios); | ||
276 | |||
277 | if (ret) | ||
278 | return ret; | ||
279 | } | ||
280 | |||
281 | ios = pcol->ios; | ||
279 | ios->pages = pcol->pages; | 282 | ios->pages = pcol->pages; |
280 | ios->nr_pages = pcol->nr_pages; | 283 | ios->nr_pages = pcol->nr_pages; |
281 | ios->length = pcol->length; | ||
282 | ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; | ||
283 | 284 | ||
284 | if (pcol->read_4_write) { | 285 | if (pcol->read_4_write) { |
285 | exofs_oi_read(oi, pcol->ios); | 286 | ore_read(pcol->ios); |
286 | return __readpages_done(pcol); | 287 | return __readpages_done(pcol); |
287 | } | 288 | } |
288 | 289 | ||
@@ -295,14 +296,14 @@ static int read_exec(struct page_collect *pcol) | |||
295 | *pcol_copy = *pcol; | 296 | *pcol_copy = *pcol; |
296 | ios->done = readpages_done; | 297 | ios->done = readpages_done; |
297 | ios->private = pcol_copy; | 298 | ios->private = pcol_copy; |
298 | ret = exofs_oi_read(oi, ios); | 299 | ret = ore_read(ios); |
299 | if (unlikely(ret)) | 300 | if (unlikely(ret)) |
300 | goto err; | 301 | goto err; |
301 | 302 | ||
302 | atomic_inc(&pcol->sbi->s_curr_pending); | 303 | atomic_inc(&pcol->sbi->s_curr_pending); |
303 | 304 | ||
304 | EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", | 305 | EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", |
305 | ios->obj.id, _LLU(ios->offset), pcol->length); | 306 | oi->one_comp.obj.id, _LLU(ios->offset), pcol->length); |
306 | 307 | ||
307 | /* pages ownership was passed to pcol_copy */ | 308 | /* pages ownership was passed to pcol_copy */ |
308 | _pcol_reset(pcol); | 309 | _pcol_reset(pcol); |
@@ -457,14 +458,14 @@ static int exofs_readpage(struct file *file, struct page *page) | |||
457 | } | 458 | } |
458 | 459 | ||
459 | /* Callback for osd_write. All writes are asynchronous */ | 460 | /* Callback for osd_write. All writes are asynchronous */ |
460 | static void writepages_done(struct exofs_io_state *ios, void *p) | 461 | static void writepages_done(struct ore_io_state *ios, void *p) |
461 | { | 462 | { |
462 | struct page_collect *pcol = p; | 463 | struct page_collect *pcol = p; |
463 | int i; | 464 | int i; |
464 | u64 resid; | 465 | u64 resid; |
465 | u64 good_bytes; | 466 | u64 good_bytes; |
466 | u64 length = 0; | 467 | u64 length = 0; |
467 | int ret = exofs_check_io(ios, &resid); | 468 | int ret = ore_check_io(ios, &resid); |
468 | 469 | ||
469 | atomic_dec(&pcol->sbi->s_curr_pending); | 470 | atomic_dec(&pcol->sbi->s_curr_pending); |
470 | 471 | ||
@@ -507,13 +508,21 @@ static void writepages_done(struct exofs_io_state *ios, void *p) | |||
507 | static int write_exec(struct page_collect *pcol) | 508 | static int write_exec(struct page_collect *pcol) |
508 | { | 509 | { |
509 | struct exofs_i_info *oi = exofs_i(pcol->inode); | 510 | struct exofs_i_info *oi = exofs_i(pcol->inode); |
510 | struct exofs_io_state *ios = pcol->ios; | 511 | struct ore_io_state *ios; |
511 | struct page_collect *pcol_copy = NULL; | 512 | struct page_collect *pcol_copy = NULL; |
512 | int ret; | 513 | int ret; |
513 | 514 | ||
514 | if (!pcol->pages) | 515 | if (!pcol->pages) |
515 | return 0; | 516 | return 0; |
516 | 517 | ||
518 | BUG_ON(pcol->ios); | ||
519 | ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false, | ||
520 | pcol->pg_first << PAGE_CACHE_SHIFT, | ||
521 | pcol->length, &pcol->ios); | ||
522 | |||
523 | if (unlikely(ret)) | ||
524 | goto err; | ||
525 | |||
517 | pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); | 526 | pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); |
518 | if (!pcol_copy) { | 527 | if (!pcol_copy) { |
519 | EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); | 528 | EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); |
@@ -523,16 +532,15 @@ static int write_exec(struct page_collect *pcol) | |||
523 | 532 | ||
524 | *pcol_copy = *pcol; | 533 | *pcol_copy = *pcol; |
525 | 534 | ||
535 | ios = pcol->ios; | ||
526 | ios->pages = pcol_copy->pages; | 536 | ios->pages = pcol_copy->pages; |
527 | ios->nr_pages = pcol_copy->nr_pages; | 537 | ios->nr_pages = pcol_copy->nr_pages; |
528 | ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; | ||
529 | ios->length = pcol_copy->length; | ||
530 | ios->done = writepages_done; | 538 | ios->done = writepages_done; |
531 | ios->private = pcol_copy; | 539 | ios->private = pcol_copy; |
532 | 540 | ||
533 | ret = exofs_oi_write(oi, ios); | 541 | ret = ore_write(ios); |
534 | if (unlikely(ret)) { | 542 | if (unlikely(ret)) { |
535 | EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); | 543 | EXOFS_ERR("write_exec: ore_write() Failed\n"); |
536 | goto err; | 544 | goto err; |
537 | } | 545 | } |
538 | 546 | ||
@@ -844,17 +852,15 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode) | |||
844 | return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); | 852 | return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); |
845 | } | 853 | } |
846 | 854 | ||
847 | const struct osd_attr g_attr_logical_length = ATTR_DEF( | ||
848 | OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); | ||
849 | |||
850 | static int _do_truncate(struct inode *inode, loff_t newsize) | 855 | static int _do_truncate(struct inode *inode, loff_t newsize) |
851 | { | 856 | { |
852 | struct exofs_i_info *oi = exofs_i(inode); | 857 | struct exofs_i_info *oi = exofs_i(inode); |
858 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; | ||
853 | int ret; | 859 | int ret; |
854 | 860 | ||
855 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 861 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
856 | 862 | ||
857 | ret = exofs_oi_truncate(oi, (u64)newsize); | 863 | ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize); |
858 | if (likely(!ret)) | 864 | if (likely(!ret)) |
859 | truncate_setsize(inode, newsize); | 865 | truncate_setsize(inode, newsize); |
860 | 866 | ||
@@ -917,30 +923,26 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
917 | [1] = g_attr_inode_file_layout, | 923 | [1] = g_attr_inode_file_layout, |
918 | [2] = g_attr_inode_dir_layout, | 924 | [2] = g_attr_inode_dir_layout, |
919 | }; | 925 | }; |
920 | struct exofs_io_state *ios; | 926 | struct ore_io_state *ios; |
921 | struct exofs_on_disk_inode_layout *layout; | 927 | struct exofs_on_disk_inode_layout *layout; |
922 | int ret; | 928 | int ret; |
923 | 929 | ||
924 | ret = exofs_get_io_state(&sbi->layout, &ios); | 930 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); |
925 | if (unlikely(ret)) { | 931 | if (unlikely(ret)) { |
926 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); | 932 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
927 | return ret; | 933 | return ret; |
928 | } | 934 | } |
929 | 935 | ||
930 | ios->obj.id = exofs_oi_objno(oi); | 936 | attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); |
931 | exofs_make_credential(oi->i_cred, &ios->obj); | 937 | attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs); |
932 | ios->cred = oi->i_cred; | ||
933 | |||
934 | attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); | ||
935 | attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); | ||
936 | 938 | ||
937 | ios->in_attr = attrs; | 939 | ios->in_attr = attrs; |
938 | ios->in_attr_len = ARRAY_SIZE(attrs); | 940 | ios->in_attr_len = ARRAY_SIZE(attrs); |
939 | 941 | ||
940 | ret = exofs_sbi_read(ios); | 942 | ret = ore_read(ios); |
941 | if (unlikely(ret)) { | 943 | if (unlikely(ret)) { |
942 | EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", | 944 | EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", |
943 | _LLU(ios->obj.id), ret); | 945 | _LLU(oi->one_comp.obj.id), ret); |
944 | memset(inode, 0, sizeof(*inode)); | 946 | memset(inode, 0, sizeof(*inode)); |
945 | inode->i_mode = 0040000 | (0777 & ~022); | 947 | inode->i_mode = 0040000 | (0777 & ~022); |
946 | /* If object is lost on target we might as well enable it's | 948 | /* If object is lost on target we might as well enable it's |
@@ -990,7 +992,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, | |||
990 | } | 992 | } |
991 | 993 | ||
992 | out: | 994 | out: |
993 | exofs_put_io_state(ios); | 995 | ore_put_io_state(ios); |
994 | return ret; | 996 | return ret; |
995 | } | 997 | } |
996 | 998 | ||
@@ -1016,6 +1018,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) | |||
1016 | return inode; | 1018 | return inode; |
1017 | oi = exofs_i(inode); | 1019 | oi = exofs_i(inode); |
1018 | __oi_init(oi); | 1020 | __oi_init(oi); |
1021 | exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, | ||
1022 | exofs_oi_objno(oi)); | ||
1019 | 1023 | ||
1020 | /* read the inode from the osd */ | 1024 | /* read the inode from the osd */ |
1021 | ret = exofs_get_inode(sb, oi, &fcb); | 1025 | ret = exofs_get_inode(sb, oi, &fcb); |
@@ -1107,21 +1111,22 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) | |||
1107 | * set the obj_created flag so that other methods know that the object exists on | 1111 | * set the obj_created flag so that other methods know that the object exists on |
1108 | * the OSD. | 1112 | * the OSD. |
1109 | */ | 1113 | */ |
1110 | static void create_done(struct exofs_io_state *ios, void *p) | 1114 | static void create_done(struct ore_io_state *ios, void *p) |
1111 | { | 1115 | { |
1112 | struct inode *inode = p; | 1116 | struct inode *inode = p; |
1113 | struct exofs_i_info *oi = exofs_i(inode); | 1117 | struct exofs_i_info *oi = exofs_i(inode); |
1114 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; | 1118 | struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; |
1115 | int ret; | 1119 | int ret; |
1116 | 1120 | ||
1117 | ret = exofs_check_io(ios, NULL); | 1121 | ret = ore_check_io(ios, NULL); |
1118 | exofs_put_io_state(ios); | 1122 | ore_put_io_state(ios); |
1119 | 1123 | ||
1120 | atomic_dec(&sbi->s_curr_pending); | 1124 | atomic_dec(&sbi->s_curr_pending); |
1121 | 1125 | ||
1122 | if (unlikely(ret)) { | 1126 | if (unlikely(ret)) { |
1123 | EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", | 1127 | EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", |
1124 | _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); | 1128 | _LLU(exofs_oi_objno(oi)), |
1129 | _LLU(oi->one_comp.obj.partition)); | ||
1125 | /*TODO: When FS is corrupted creation can fail, the object may | 1130 | /*TODO: When FS is corrupted creation can fail, the object may |
1126 | * already exist. Get rid of this asynchronous creation; if it exists, | 1131 | * already exist. Get rid of this asynchronous creation; if it exists, |
1127 | * increment the obj counter and try the next object. Until we | 1132 | * increment the obj counter and try the next object. Until we |
@@ -1140,14 +1145,13 @@ static void create_done(struct exofs_io_state *ios, void *p) | |||
1140 | */ | 1145 | */ |
1141 | struct inode *exofs_new_inode(struct inode *dir, int mode) | 1146 | struct inode *exofs_new_inode(struct inode *dir, int mode) |
1142 | { | 1147 | { |
1143 | struct super_block *sb; | 1148 | struct super_block *sb = dir->i_sb; |
1149 | struct exofs_sb_info *sbi = sb->s_fs_info; | ||
1144 | struct inode *inode; | 1150 | struct inode *inode; |
1145 | struct exofs_i_info *oi; | 1151 | struct exofs_i_info *oi; |
1146 | struct exofs_sb_info *sbi; | 1152 | struct ore_io_state *ios; |
1147 | struct exofs_io_state *ios; | ||
1148 | int ret; | 1153 | int ret; |
1149 | 1154 | ||
1150 | sb = dir->i_sb; | ||
1151 | inode = new_inode(sb); | 1155 | inode = new_inode(sb); |
1152 | if (!inode) | 1156 | if (!inode) |
1153 | return ERR_PTR(-ENOMEM); | 1157 | return ERR_PTR(-ENOMEM); |
@@ -1157,8 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1157 | 1161 | ||
1158 | set_obj_2bcreated(oi); | 1162 | set_obj_2bcreated(oi); |
1159 | 1163 | ||
1160 | sbi = sb->s_fs_info; | ||
1161 | |||
1162 | inode->i_mapping->backing_dev_info = sb->s_bdi; | 1164 | inode->i_mapping->backing_dev_info = sb->s_bdi; |
1163 | inode_init_owner(inode, dir, mode); | 1165 | inode_init_owner(inode, dir, mode); |
1164 | inode->i_ino = sbi->s_nextid++; | 1166 | inode->i_ino = sbi->s_nextid++; |
@@ -1170,25 +1172,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) | |||
1170 | spin_unlock(&sbi->s_next_gen_lock); | 1172 | spin_unlock(&sbi->s_next_gen_lock); |
1171 | insert_inode_hash(inode); | 1173 | insert_inode_hash(inode); |
1172 | 1174 | ||
1175 | exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info, | ||
1176 | exofs_oi_objno(oi)); | ||
1173 | exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ | 1177 | exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ |
1174 | 1178 | ||
1175 | mark_inode_dirty(inode); | 1179 | mark_inode_dirty(inode); |
1176 | 1180 | ||
1177 | ret = exofs_get_io_state(&sbi->layout, &ios); | 1181 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); |
1178 | if (unlikely(ret)) { | 1182 | if (unlikely(ret)) { |
1179 | EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); | 1183 | EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); |
1180 | return ERR_PTR(ret); | 1184 | return ERR_PTR(ret); |
1181 | } | 1185 | } |
1182 | 1186 | ||
1183 | ios->obj.id = exofs_oi_objno(oi); | ||
1184 | exofs_make_credential(oi->i_cred, &ios->obj); | ||
1185 | |||
1186 | ios->done = create_done; | 1187 | ios->done = create_done; |
1187 | ios->private = inode; | 1188 | ios->private = inode; |
1188 | ios->cred = oi->i_cred; | 1189 | |
1189 | ret = exofs_sbi_create(ios); | 1190 | ret = ore_create(ios); |
1190 | if (ret) { | 1191 | if (ret) { |
1191 | exofs_put_io_state(ios); | 1192 | ore_put_io_state(ios); |
1192 | return ERR_PTR(ret); | 1193 | return ERR_PTR(ret); |
1193 | } | 1194 | } |
1194 | atomic_inc(&sbi->s_curr_pending); | 1195 | atomic_inc(&sbi->s_curr_pending); |
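With the components embedded in the inode, the submission side of the new allocation path reduces to four steps: bind an io_state to oi->comps, install the callback, fire, count it pending. Condensed from the hunk above:

	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
	if (unlikely(ret))
		return ERR_PTR(ret);

	ios->done = create_done;	/* harvests status and frees ios */
	ios->private = inode;
	ret = ore_create(ios);		/* obj id/cred now come from comps */
	if (ret) {
		ore_put_io_state(ios);	/* never queued, still ours */
		return ERR_PTR(ret);
	}
	atomic_inc(&sbi->s_curr_pending);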
@@ -1207,11 +1208,11 @@ struct updatei_args { | |||
1207 | /* | 1208 | /* |
1208 | * Callback function from exofs_update_inode(). | 1209 | * Callback function from exofs_update_inode(). |
1209 | */ | 1210 | */ |
1210 | static void updatei_done(struct exofs_io_state *ios, void *p) | 1211 | static void updatei_done(struct ore_io_state *ios, void *p) |
1211 | { | 1212 | { |
1212 | struct updatei_args *args = p; | 1213 | struct updatei_args *args = p; |
1213 | 1214 | ||
1214 | exofs_put_io_state(ios); | 1215 | ore_put_io_state(ios); |
1215 | 1216 | ||
1216 | atomic_dec(&args->sbi->s_curr_pending); | 1217 | atomic_dec(&args->sbi->s_curr_pending); |
1217 | 1218 | ||
@@ -1227,7 +1228,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1227 | struct exofs_i_info *oi = exofs_i(inode); | 1228 | struct exofs_i_info *oi = exofs_i(inode); |
1228 | struct super_block *sb = inode->i_sb; | 1229 | struct super_block *sb = inode->i_sb; |
1229 | struct exofs_sb_info *sbi = sb->s_fs_info; | 1230 | struct exofs_sb_info *sbi = sb->s_fs_info; |
1230 | struct exofs_io_state *ios; | 1231 | struct ore_io_state *ios; |
1231 | struct osd_attr attr; | 1232 | struct osd_attr attr; |
1232 | struct exofs_fcb *fcb; | 1233 | struct exofs_fcb *fcb; |
1233 | struct updatei_args *args; | 1234 | struct updatei_args *args; |
@@ -1266,9 +1267,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1266 | } else | 1267 | } else |
1267 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); | 1268 | memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); |
1268 | 1269 | ||
1269 | ret = exofs_get_io_state(&sbi->layout, &ios); | 1270 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); |
1270 | if (unlikely(ret)) { | 1271 | if (unlikely(ret)) { |
1271 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); | 1272 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
1272 | goto free_args; | 1273 | goto free_args; |
1273 | } | 1274 | } |
1274 | 1275 | ||
@@ -1285,13 +1286,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync) | |||
1285 | ios->private = args; | 1286 | ios->private = args; |
1286 | } | 1287 | } |
1287 | 1288 | ||
1288 | ret = exofs_oi_write(oi, ios); | 1289 | ret = ore_write(ios); |
1289 | if (!do_sync && !ret) { | 1290 | if (!do_sync && !ret) { |
1290 | atomic_inc(&sbi->s_curr_pending); | 1291 | atomic_inc(&sbi->s_curr_pending); |
1291 | goto out; /* deallocation in updatei_done */ | 1292 | goto out; /* deallocation in updatei_done */ |
1292 | } | 1293 | } |
1293 | 1294 | ||
1294 | exofs_put_io_state(ios); | 1295 | ore_put_io_state(ios); |
1295 | free_args: | 1296 | free_args: |
1296 | kfree(args); | 1297 | kfree(args); |
1297 | out: | 1298 | out: |
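exofs_update_inode() keeps its dual personality under the new API: if do_sync is clear and ore_write() was queued, ownership of ios and args transfers to updatei_done(); on the synchronous path, or if submission fails, the caller still owns and frees both. The ownership rule, as a sketch:

	ret = ore_write(ios);
	if (!do_sync && !ret) {
		/* async: updatei_done() does ore_put_io_state() + kfree(args) */
		atomic_inc(&sbi->s_curr_pending);
		return 0;
	}
	/* sync, or submission failed: we still own everything */
	ore_put_io_state(ios);
	kfree(args);
	return ret;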
@@ -1310,11 +1311,11 @@ int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
1310 | * Callback function from exofs_delete_inode() - don't have much cleaning up to | 1311 | * Callback function from exofs_delete_inode() - don't have much cleaning up to |
1311 | * do. | 1312 | * do. |
1312 | */ | 1313 | */ |
1313 | static void delete_done(struct exofs_io_state *ios, void *p) | 1314 | static void delete_done(struct ore_io_state *ios, void *p) |
1314 | { | 1315 | { |
1315 | struct exofs_sb_info *sbi = p; | 1316 | struct exofs_sb_info *sbi = p; |
1316 | 1317 | ||
1317 | exofs_put_io_state(ios); | 1318 | ore_put_io_state(ios); |
1318 | 1319 | ||
1319 | atomic_dec(&sbi->s_curr_pending); | 1320 | atomic_dec(&sbi->s_curr_pending); |
1320 | } | 1321 | } |
@@ -1329,7 +1330,7 @@ void exofs_evict_inode(struct inode *inode) | |||
1329 | struct exofs_i_info *oi = exofs_i(inode); | 1330 | struct exofs_i_info *oi = exofs_i(inode); |
1330 | struct super_block *sb = inode->i_sb; | 1331 | struct super_block *sb = inode->i_sb; |
1331 | struct exofs_sb_info *sbi = sb->s_fs_info; | 1332 | struct exofs_sb_info *sbi = sb->s_fs_info; |
1332 | struct exofs_io_state *ios; | 1333 | struct ore_io_state *ios; |
1333 | int ret; | 1334 | int ret; |
1334 | 1335 | ||
1335 | truncate_inode_pages(&inode->i_data, 0); | 1336 | truncate_inode_pages(&inode->i_data, 0); |
@@ -1349,20 +1350,19 @@ void exofs_evict_inode(struct inode *inode) | |||
1349 | /* ignore the error, attempt a remove anyway */ | 1350 | /* ignore the error, attempt a remove anyway */ |
1350 | 1351 | ||
1351 | /* Now Remove the OSD objects */ | 1352 | /* Now Remove the OSD objects */ |
1352 | ret = exofs_get_io_state(&sbi->layout, &ios); | 1353 | ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios); |
1353 | if (unlikely(ret)) { | 1354 | if (unlikely(ret)) { |
1354 | EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); | 1355 | EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); |
1355 | return; | 1356 | return; |
1356 | } | 1357 | } |
1357 | 1358 | ||
1358 | ios->obj.id = exofs_oi_objno(oi); | ||
1359 | ios->done = delete_done; | 1359 | ios->done = delete_done; |
1360 | ios->private = sbi; | 1360 | ios->private = sbi; |
1361 | ios->cred = oi->i_cred; | 1361 | |
1362 | ret = exofs_sbi_remove(ios); | 1362 | ret = ore_remove(ios); |
1363 | if (ret) { | 1363 | if (ret) { |
1364 | EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); | 1364 | EXOFS_ERR("%s: ore_remove failed\n", __func__); |
1365 | exofs_put_io_state(ios); | 1365 | ore_put_io_state(ios); |
1366 | return; | 1366 | return; |
1367 | } | 1367 | } |
1368 | atomic_inc(&sbi->s_curr_pending); | 1368 | atomic_inc(&sbi->s_curr_pending); |
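Removal is the same pattern as creation with one step fewer: the object id and credential no longer have to be copied into the io_state because they travel inside oi->comps. The post-patch remove path, condensed:

	ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
	if (unlikely(ret))
		return;

	ios->done = delete_done;	/* drops ios and s_curr_pending */
	ios->private = sbi;
	if (ore_remove(ios)) {
		ore_put_io_state(ios);	/* never queued, still ours */
		return;
	}
	atomic_inc(&sbi->s_curr_pending);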
diff --git a/fs/exofs/ios.c b/fs/exofs/ore.c index f74a2ec027a6..25305af88198 100644 --- a/fs/exofs/ios.c +++ b/fs/exofs/ore.c | |||
@@ -23,81 +23,87 @@ | |||
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
26 | #include <scsi/scsi_device.h> | ||
27 | #include <asm/div64.h> | 26 | #include <asm/div64.h> |
28 | 27 | ||
29 | #include "exofs.h" | 28 | #include <scsi/osd_ore.h> |
30 | 29 | ||
31 | #define EXOFS_DBGMSG2(M...) do {} while (0) | 30 | #define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) |
32 | /* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ | ||
33 | 31 | ||
34 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) | 32 | #ifdef CONFIG_EXOFS_DEBUG |
35 | { | 33 | #define ORE_DBGMSG(fmt, a...) \ |
36 | osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); | 34 | printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) |
37 | } | 35 | #else |
36 | #define ORE_DBGMSG(fmt, a...) \ | ||
37 | do { if (0) printk(fmt, ##a); } while (0) | ||
38 | #endif | ||
38 | 39 | ||
39 | int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, | 40 | /* u64 has problems with printk this will cast it to unsigned long long */ |
40 | u64 offset, void *p, unsigned length) | 41 | #define _LLU(x) (unsigned long long)(x) |
41 | { | ||
42 | struct osd_request *or = osd_start_request(od, GFP_KERNEL); | ||
43 | /* struct osd_sense_info osi = {.key = 0};*/ | ||
44 | int ret; | ||
45 | 42 | ||
46 | if (unlikely(!or)) { | 43 | #define ORE_DBGMSG2(M...) do {} while (0) |
47 | EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); | 44 | /* #define ORE_DBGMSG2 ORE_DBGMSG */ |
48 | return -ENOMEM; | ||
49 | } | ||
50 | ret = osd_req_read_kern(or, obj, offset, p, length); | ||
51 | if (unlikely(ret)) { | ||
52 | EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); | ||
53 | goto out; | ||
54 | } | ||
55 | 45 | ||
56 | ret = osd_finalize_request(or, 0, cred, NULL); | 46 | MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); |
57 | if (unlikely(ret)) { | 47 | MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); |
58 | EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); | 48 | MODULE_LICENSE("GPL"); |
59 | goto out; | ||
60 | } | ||
61 | 49 | ||
62 | ret = osd_execute_request(or); | 50 | static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) |
63 | if (unlikely(ret)) | 51 | { |
64 | EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); | 52 | return ios->comps->comps[index & ios->comps->single_comp].cred; |
65 | /* osd_req_decode_sense(or, ret); */ | 53 | } |
66 | 54 | ||
67 | out: | 55 | static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) |
68 | osd_end_request(or); | 56 | { |
69 | return ret; | 57 | return &ios->comps->comps[index & ios->comps->single_comp].obj; |
70 | } | 58 | } |
71 | 59 | ||
72 | int exofs_get_io_state(struct exofs_layout *layout, | 60 | static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) |
73 | struct exofs_io_state **pios) | ||
74 | { | 61 | { |
75 | struct exofs_io_state *ios; | 62 | return ios->comps->ods[index]; |
63 | } | ||
64 | |||
65 | int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps, | ||
66 | bool is_reading, u64 offset, u64 length, | ||
67 | struct ore_io_state **pios) | ||
68 | { | ||
69 | struct ore_io_state *ios; | ||
76 | 70 | ||
77 | /*TODO: Maybe use kmem_cache per sbi of size | 71 | /*TODO: Maybe use kmem_cache per sbi of size |
78 | * exofs_io_state_size(layout->s_numdevs) | 72 | * exofs_io_state_size(layout->s_numdevs) |
79 | */ | 73 | */ |
80 | ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); | 74 | ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL); |
81 | if (unlikely(!ios)) { | 75 | if (unlikely(!ios)) { |
82 | EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", | 76 | ORE_DBGMSG("Failed kzalloc bytes=%d\n", |
83 | exofs_io_state_size(layout->s_numdevs)); | 77 | ore_io_state_size(comps->numdevs)); |
84 | *pios = NULL; | 78 | *pios = NULL; |
85 | return -ENOMEM; | 79 | return -ENOMEM; |
86 | } | 80 | } |
87 | 81 | ||
88 | ios->layout = layout; | 82 | ios->layout = layout; |
89 | ios->obj.partition = layout->s_pid; | 83 | ios->comps = comps; |
84 | ios->offset = offset; | ||
85 | ios->length = length; | ||
86 | ios->reading = is_reading; | ||
87 | |||
90 | *pios = ios; | 88 | *pios = ios; |
91 | return 0; | 89 | return 0; |
92 | } | 90 | } |
91 | EXPORT_SYMBOL(ore_get_rw_state); | ||
92 | |||
93 | int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps, | ||
94 | struct ore_io_state **ios) | ||
95 | { | ||
96 | return ore_get_rw_state(layout, comps, true, 0, 0, ios); | ||
97 | } | ||
98 | EXPORT_SYMBOL(ore_get_io_state); | ||
93 | 99 | ||
94 | void exofs_put_io_state(struct exofs_io_state *ios) | 100 | void ore_put_io_state(struct ore_io_state *ios) |
95 | { | 101 | { |
96 | if (ios) { | 102 | if (ios) { |
97 | unsigned i; | 103 | unsigned i; |
98 | 104 | ||
99 | for (i = 0; i < ios->numdevs; i++) { | 105 | for (i = 0; i < ios->numdevs; i++) { |
100 | struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; | 106 | struct ore_per_dev_state *per_dev = &ios->per_dev[i]; |
101 | 107 | ||
102 | if (per_dev->or) | 108 | if (per_dev->or) |
103 | osd_end_request(per_dev->or); | 109 | osd_end_request(per_dev->or); |
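The three _ios_*() accessors are the heart of the new indirection, and `index & ios->comps->single_comp` is a branch-free selector: a caller with one shared component sets single_comp to 0 so every device index collapses to component 0, while a per-device component table sets it to all-ones so the index passes through. That reading is inferred from the accessors themselves; a stand-alone, user-space illustration of the mask trick (the sentinel values are assumptions):

	#include <stdio.h>

	#define SINGLE_COMP    0u	/* one comp shared by all devices */
	#define MULTIPLE_COMPS (~0u)	/* one comp per device */

	static unsigned comp_index(unsigned dev_index, unsigned single_comp)
	{
		return dev_index & single_comp;	/* 0 => always component 0 */
	}

	int main(void)
	{
		unsigned i;

		for (i = 0; i < 4; i++)
			printf("dev %u -> comp %u (single) / %u (multi)\n",
			       i, comp_index(i, SINGLE_COMP),
			       comp_index(i, MULTIPLE_COMPS));
		return 0;
	}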
@@ -108,31 +114,9 @@ void exofs_put_io_state(struct exofs_io_state *ios) | |||
108 | kfree(ios); | 114 | kfree(ios); |
109 | } | 115 | } |
110 | } | 116 | } |
117 | EXPORT_SYMBOL(ore_put_io_state); | ||
111 | 118 | ||
112 | unsigned exofs_layout_od_id(struct exofs_layout *layout, | 119 | static void _sync_done(struct ore_io_state *ios, void *p) |
113 | osd_id obj_no, unsigned layout_index) | ||
114 | { | ||
115 | /* switch (layout->lay_func) { | ||
116 | case LAYOUT_MOVING_WINDOW: | ||
117 | {*/ | ||
118 | unsigned dev_mod = obj_no; | ||
119 | |||
120 | return (layout_index + dev_mod * layout->mirrors_p1) % | ||
121 | layout->s_numdevs; | ||
122 | /* } | ||
123 | case LAYOUT_FUNC_IMPLICT: | ||
124 | return layout->devs[layout_index]; | ||
125 | }*/ | ||
126 | } | ||
127 | |||
128 | static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, | ||
129 | unsigned layout_index) | ||
130 | { | ||
131 | return ios->layout->s_ods[ | ||
132 | exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; | ||
133 | } | ||
134 | |||
135 | static void _sync_done(struct exofs_io_state *ios, void *p) | ||
136 | { | 120 | { |
137 | struct completion *waiting = p; | 121 | struct completion *waiting = p; |
138 | 122 | ||
@@ -141,20 +125,20 @@ static void _sync_done(struct exofs_io_state *ios, void *p) | |||
141 | 125 | ||
142 | static void _last_io(struct kref *kref) | 126 | static void _last_io(struct kref *kref) |
143 | { | 127 | { |
144 | struct exofs_io_state *ios = container_of( | 128 | struct ore_io_state *ios = container_of( |
145 | kref, struct exofs_io_state, kref); | 129 | kref, struct ore_io_state, kref); |
146 | 130 | ||
147 | ios->done(ios, ios->private); | 131 | ios->done(ios, ios->private); |
148 | } | 132 | } |
149 | 133 | ||
150 | static void _done_io(struct osd_request *or, void *p) | 134 | static void _done_io(struct osd_request *or, void *p) |
151 | { | 135 | { |
152 | struct exofs_io_state *ios = p; | 136 | struct ore_io_state *ios = p; |
153 | 137 | ||
154 | kref_put(&ios->kref, _last_io); | 138 | kref_put(&ios->kref, _last_io); |
155 | } | 139 | } |
156 | 140 | ||
157 | static int exofs_io_execute(struct exofs_io_state *ios) | 141 | static int ore_io_execute(struct ore_io_state *ios) |
158 | { | 142 | { |
159 | DECLARE_COMPLETION_ONSTACK(wait); | 143 | DECLARE_COMPLETION_ONSTACK(wait); |
160 | bool sync = (ios->done == NULL); | 144 | bool sync = (ios->done == NULL); |
@@ -170,9 +154,9 @@ static int exofs_io_execute(struct exofs_io_state *ios) | |||
170 | if (unlikely(!or)) | 154 | if (unlikely(!or)) |
171 | continue; | 155 | continue; |
172 | 156 | ||
173 | ret = osd_finalize_request(or, 0, ios->cred, NULL); | 157 | ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); |
174 | if (unlikely(ret)) { | 158 | if (unlikely(ret)) { |
175 | EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", | 159 | ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", |
176 | ret); | 160 | ret); |
177 | return ret; | 161 | return ret; |
178 | } | 162 | } |
@@ -194,7 +178,7 @@ static int exofs_io_execute(struct exofs_io_state *ios) | |||
194 | 178 | ||
195 | if (sync) { | 179 | if (sync) { |
196 | wait_for_completion(&wait); | 180 | wait_for_completion(&wait); |
197 | ret = exofs_check_io(ios, NULL); | 181 | ret = ore_check_io(ios, NULL); |
198 | } | 182 | } |
199 | return ret; | 183 | return ret; |
200 | } | 184 | } |
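ore_io_execute() overloads "no done callback" to mean "synchronous": it temporarily installs _sync_done with an on-stack completion, waits for the last request's kref to drop, then collects status itself. Reduced to its control flow:

	DECLARE_COMPLETION_ONSTACK(wait);
	bool sync = (ios->done == NULL);

	if (sync) {
		ios->done = _sync_done;		/* complete(&wait) on last put */
		ios->private = &wait;
	}
	/* ... osd_finalize_request() + submit, one request per device ... */
	if (sync) {
		wait_for_completion(&wait);
		ret = ore_check_io(ios, NULL);
	}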
@@ -214,7 +198,7 @@ static void _clear_bio(struct bio *bio) | |||
214 | } | 198 | } |
215 | } | 199 | } |
216 | 200 | ||
217 | int exofs_check_io(struct exofs_io_state *ios, u64 *resid) | 201 | int ore_check_io(struct ore_io_state *ios, u64 *resid) |
218 | { | 202 | { |
219 | enum osd_err_priority acumulated_osd_err = 0; | 203 | enum osd_err_priority acumulated_osd_err = 0; |
220 | int acumulated_lin_err = 0; | 204 | int acumulated_lin_err = 0; |
@@ -235,7 +219,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) | |||
235 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { | 219 | if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { |
236 | /* start read offset past end of file */ | 220 | /* start read offset past end of file */ |
237 | _clear_bio(ios->per_dev[i].bio); | 221 | _clear_bio(ios->per_dev[i].bio); |
238 | EXOFS_DBGMSG("start read offset passed end of file " | 222 | ORE_DBGMSG("start read offset passed end of file " |
239 | "offset=0x%llx, length=0x%llx\n", | 223 | "offset=0x%llx, length=0x%llx\n", |
240 | _LLU(ios->per_dev[i].offset), | 224 | _LLU(ios->per_dev[i].offset), |
241 | _LLU(ios->per_dev[i].length)); | 225 | _LLU(ios->per_dev[i].length)); |
@@ -259,6 +243,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) | |||
259 | 243 | ||
260 | return acumulated_lin_err; | 244 | return acumulated_lin_err; |
261 | } | 245 | } |
246 | EXPORT_SYMBOL(ore_check_io); | ||
262 | 247 | ||
263 | /* | 248 | /* |
264 | * L - logical offset into the file | 249 | * L - logical offset into the file |
@@ -305,20 +290,21 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid) | |||
305 | struct _striping_info { | 290 | struct _striping_info { |
306 | u64 obj_offset; | 291 | u64 obj_offset; |
307 | u64 group_length; | 292 | u64 group_length; |
293 | u64 M; /* for truncate */ | ||
308 | unsigned dev; | 294 | unsigned dev; |
309 | unsigned unit_off; | 295 | unsigned unit_off; |
310 | }; | 296 | }; |
311 | 297 | ||
312 | static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, | 298 | static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset, |
313 | struct _striping_info *si) | 299 | struct _striping_info *si) |
314 | { | 300 | { |
315 | u32 stripe_unit = ios->layout->stripe_unit; | 301 | u32 stripe_unit = layout->stripe_unit; |
316 | u32 group_width = ios->layout->group_width; | 302 | u32 group_width = layout->group_width; |
317 | u64 group_depth = ios->layout->group_depth; | 303 | u64 group_depth = layout->group_depth; |
318 | 304 | ||
319 | u32 U = stripe_unit * group_width; | 305 | u32 U = stripe_unit * group_width; |
320 | u64 T = U * group_depth; | 306 | u64 T = U * group_depth; |
321 | u64 S = T * ios->layout->group_count; | 307 | u64 S = T * layout->group_count; |
322 | u64 M = div64_u64(file_offset, S); | 308 | u64 M = div64_u64(file_offset, S); |
323 | 309 | ||
324 | /* | 310 | /* |
@@ -333,7 +319,7 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, | |||
333 | 319 | ||
334 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ | 320 | /* "H - (N * U)" is just "H % U" so it's bound to u32 */ |
335 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; | 321 | si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; |
336 | si->dev *= ios->layout->mirrors_p1; | 322 | si->dev *= layout->mirrors_p1; |
337 | 323 | ||
338 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); | 324 | div_u64_rem(file_offset, stripe_unit, &si->unit_off); |
339 | 325 | ||
@@ -341,15 +327,16 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, | |||
341 | (M * group_depth * stripe_unit); | 327 | (M * group_depth * stripe_unit); |
342 | 328 | ||
343 | si->group_length = T - H; | 329 | si->group_length = T - H; |
330 | si->M = M; | ||
344 | } | 331 | } |
345 | 332 | ||
346 | static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, | 333 | static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, |
347 | unsigned pgbase, struct exofs_per_dev_state *per_dev, | 334 | unsigned pgbase, struct ore_per_dev_state *per_dev, |
348 | int cur_len) | 335 | int cur_len) |
349 | { | 336 | { |
350 | unsigned pg = *cur_pg; | 337 | unsigned pg = *cur_pg; |
351 | struct request_queue *q = | 338 | struct request_queue *q = |
352 | osd_request_queue(exofs_ios_od(ios, per_dev->dev)); | 339 | osd_request_queue(_ios_od(ios, per_dev->dev)); |
353 | 340 | ||
354 | per_dev->length += cur_len; | 341 | per_dev->length += cur_len; |
355 | 342 | ||
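_calc_stripe_info() now takes the layout directly, which makes the arithmetic in the two hunks above easy to exercise outside the kernel. A user-space transcription of those formulas — the parameter values are made up for illustration, only the math is from the patch:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t stripe_unit = 4096, group_width = 4, mirrors_p1 = 1;
		uint64_t group_depth = 2, group_count = 2;
		uint64_t L = 70000;			/* file offset to map */

		uint32_t U = stripe_unit * group_width;	/* stripe size */
		uint64_t T = (uint64_t)U * group_depth;	/* group size */
		uint64_t S = T * group_count;		/* full device cycle */
		uint64_t M = L / S;			/* cycle number */
		uint64_t G = (L - M * S) / T;		/* group within cycle */
		uint64_t H = (L - M * S) - G * T;	/* offset within group */
		uint64_t N = H / U;			/* stripe within group */
		unsigned dev = (unsigned)((H - N * U) / stripe_unit +
					  G * group_width) * mirrors_p1;
		uint64_t unit_off = L % stripe_unit;
		uint64_t obj_off = unit_off + N * stripe_unit +
				   M * group_depth * stripe_unit;

		printf("dev=%u obj_offset=%llu group_length=%llu\n", dev,
		       (unsigned long long)obj_off,
		       (unsigned long long)(T - H));
		return 0;
	}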
@@ -361,7 +348,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, | |||
361 | 348 | ||
362 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); | 349 | per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); |
363 | if (unlikely(!per_dev->bio)) { | 350 | if (unlikely(!per_dev->bio)) { |
364 | EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", | 351 | ORE_DBGMSG("Failed to allocate BIO size=%u\n", |
365 | bio_size); | 352 | bio_size); |
366 | return -ENOMEM; | 353 | return -ENOMEM; |
367 | } | 354 | } |
@@ -387,7 +374,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, | |||
387 | return 0; | 374 | return 0; |
388 | } | 375 | } |
389 | 376 | ||
390 | static int _prepare_one_group(struct exofs_io_state *ios, u64 length, | 377 | static int _prepare_one_group(struct ore_io_state *ios, u64 length, |
391 | struct _striping_info *si) | 378 | struct _striping_info *si) |
392 | { | 379 | { |
393 | unsigned stripe_unit = ios->layout->stripe_unit; | 380 | unsigned stripe_unit = ios->layout->stripe_unit; |
@@ -400,7 +387,7 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length, | |||
400 | int ret = 0; | 387 | int ret = 0; |
401 | 388 | ||
402 | while (length) { | 389 | while (length) { |
403 | struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; | 390 | struct ore_per_dev_state *per_dev = &ios->per_dev[dev]; |
404 | unsigned cur_len, page_off = 0; | 391 | unsigned cur_len, page_off = 0; |
405 | 392 | ||
406 | if (!per_dev->length) { | 393 | if (!per_dev->length) { |
@@ -443,7 +430,7 @@ out: | |||
443 | return ret; | 430 | return ret; |
444 | } | 431 | } |
445 | 432 | ||
446 | static int _prepare_for_striping(struct exofs_io_state *ios) | 433 | static int _prepare_for_striping(struct ore_io_state *ios) |
447 | { | 434 | { |
448 | u64 length = ios->length; | 435 | u64 length = ios->length; |
449 | u64 offset = ios->offset; | 436 | u64 offset = ios->offset; |
@@ -452,9 +439,9 @@ static int _prepare_for_striping(struct exofs_io_state *ios) | |||
452 | 439 | ||
453 | if (!ios->pages) { | 440 | if (!ios->pages) { |
454 | if (ios->kern_buff) { | 441 | if (ios->kern_buff) { |
455 | struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; | 442 | struct ore_per_dev_state *per_dev = &ios->per_dev[0]; |
456 | 443 | ||
457 | _calc_stripe_info(ios, ios->offset, &si); | 444 | _calc_stripe_info(ios->layout, ios->offset, &si); |
458 | per_dev->offset = si.obj_offset; | 445 | per_dev->offset = si.obj_offset; |
459 | per_dev->dev = si.dev; | 446 | per_dev->dev = si.dev; |
460 | 447 | ||
@@ -468,7 +455,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios) | |||
468 | } | 455 | } |
469 | 456 | ||
470 | while (length) { | 457 | while (length) { |
471 | _calc_stripe_info(ios, offset, &si); | 458 | _calc_stripe_info(ios->layout, offset, &si); |
472 | 459 | ||
473 | if (length < si.group_length) | 460 | if (length < si.group_length) |
474 | si.group_length = length; | 461 | si.group_length = length; |
@@ -485,57 +472,59 @@ out: | |||
485 | return ret; | 472 | return ret; |
486 | } | 473 | } |
487 | 474 | ||
488 | int exofs_sbi_create(struct exofs_io_state *ios) | 475 | int ore_create(struct ore_io_state *ios) |
489 | { | 476 | { |
490 | int i, ret; | 477 | int i, ret; |
491 | 478 | ||
492 | for (i = 0; i < ios->layout->s_numdevs; i++) { | 479 | for (i = 0; i < ios->comps->numdevs; i++) { |
493 | struct osd_request *or; | 480 | struct osd_request *or; |
494 | 481 | ||
495 | or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); | 482 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
496 | if (unlikely(!or)) { | 483 | if (unlikely(!or)) { |
497 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 484 | ORE_ERR("%s: osd_start_request failed\n", __func__); |
498 | ret = -ENOMEM; | 485 | ret = -ENOMEM; |
499 | goto out; | 486 | goto out; |
500 | } | 487 | } |
501 | ios->per_dev[i].or = or; | 488 | ios->per_dev[i].or = or; |
502 | ios->numdevs++; | 489 | ios->numdevs++; |
503 | 490 | ||
504 | osd_req_create_object(or, &ios->obj); | 491 | osd_req_create_object(or, _ios_obj(ios, i)); |
505 | } | 492 | } |
506 | ret = exofs_io_execute(ios); | 493 | ret = ore_io_execute(ios); |
507 | 494 | ||
508 | out: | 495 | out: |
509 | return ret; | 496 | return ret; |
510 | } | 497 | } |
498 | EXPORT_SYMBOL(ore_create); | ||
511 | 499 | ||
512 | int exofs_sbi_remove(struct exofs_io_state *ios) | 500 | int ore_remove(struct ore_io_state *ios) |
513 | { | 501 | { |
514 | int i, ret; | 502 | int i, ret; |
515 | 503 | ||
516 | for (i = 0; i < ios->layout->s_numdevs; i++) { | 504 | for (i = 0; i < ios->comps->numdevs; i++) { |
517 | struct osd_request *or; | 505 | struct osd_request *or; |
518 | 506 | ||
519 | or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); | 507 | or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); |
520 | if (unlikely(!or)) { | 508 | if (unlikely(!or)) { |
521 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 509 | ORE_ERR("%s: osd_start_request failed\n", __func__); |
522 | ret = -ENOMEM; | 510 | ret = -ENOMEM; |
523 | goto out; | 511 | goto out; |
524 | } | 512 | } |
525 | ios->per_dev[i].or = or; | 513 | ios->per_dev[i].or = or; |
526 | ios->numdevs++; | 514 | ios->numdevs++; |
527 | 515 | ||
528 | osd_req_remove_object(or, &ios->obj); | 516 | osd_req_remove_object(or, _ios_obj(ios, i)); |
529 | } | 517 | } |
530 | ret = exofs_io_execute(ios); | 518 | ret = ore_io_execute(ios); |
531 | 519 | ||
532 | out: | 520 | out: |
533 | return ret; | 521 | return ret; |
534 | } | 522 | } |
523 | EXPORT_SYMBOL(ore_remove); | ||
535 | 524 | ||
536 | static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) | 525 | static int _write_mirror(struct ore_io_state *ios, int cur_comp) |
537 | { | 526 | { |
538 | struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; | 527 | struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; |
539 | unsigned dev = ios->per_dev[cur_comp].dev; | 528 | unsigned dev = ios->per_dev[cur_comp].dev; |
540 | unsigned last_comp = cur_comp + ios->layout->mirrors_p1; | 529 | unsigned last_comp = cur_comp + ios->layout->mirrors_p1; |
541 | int ret = 0; | 530 | int ret = 0; |
@@ -544,12 +533,12 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) | |||
544 | return 0; /* Just an empty slot */ | 533 | return 0; /* Just an empty slot */ |
545 | 534 | ||
546 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { | 535 | for (; cur_comp < last_comp; ++cur_comp, ++dev) { |
547 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | 536 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; |
548 | struct osd_request *or; | 537 | struct osd_request *or; |
549 | 538 | ||
550 | or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); | 539 | or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); |
551 | if (unlikely(!or)) { | 540 | if (unlikely(!or)) { |
552 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 541 | ORE_ERR("%s: osd_start_request failed\n", __func__); |
553 | ret = -ENOMEM; | 542 | ret = -ENOMEM; |
554 | goto out; | 543 | goto out; |
555 | } | 544 | } |
@@ -563,7 +552,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) | |||
563 | bio = bio_kmalloc(GFP_KERNEL, | 552 | bio = bio_kmalloc(GFP_KERNEL, |
564 | master_dev->bio->bi_max_vecs); | 553 | master_dev->bio->bi_max_vecs); |
565 | if (unlikely(!bio)) { | 554 | if (unlikely(!bio)) { |
566 | EXOFS_DBGMSG( | 555 | ORE_DBGMSG( |
567 | "Failed to allocate BIO size=%u\n", | 556 | "Failed to allocate BIO size=%u\n", |
568 | master_dev->bio->bi_max_vecs); | 557 | master_dev->bio->bi_max_vecs); |
569 | ret = -ENOMEM; | 558 | ret = -ENOMEM; |
@@ -582,25 +571,29 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) | |||
582 | bio->bi_rw |= REQ_WRITE; | 571 | bio->bi_rw |= REQ_WRITE; |
583 | } | 572 | } |
584 | 573 | ||
585 | osd_req_write(or, &ios->obj, per_dev->offset, bio, | 574 | osd_req_write(or, _ios_obj(ios, dev), per_dev->offset, |
586 | per_dev->length); | 575 | bio, per_dev->length); |
587 | EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " | 576 | ORE_DBGMSG("write(0x%llx) offset=0x%llx " |
588 | "length=0x%llx dev=%d\n", | 577 | "length=0x%llx dev=%d\n", |
589 | _LLU(ios->obj.id), _LLU(per_dev->offset), | 578 | _LLU(_ios_obj(ios, dev)->id), |
579 | _LLU(per_dev->offset), | ||
590 | _LLU(per_dev->length), dev); | 580 | _LLU(per_dev->length), dev); |
591 | } else if (ios->kern_buff) { | 581 | } else if (ios->kern_buff) { |
592 | ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, | 582 | ret = osd_req_write_kern(or, _ios_obj(ios, dev), |
593 | ios->kern_buff, ios->length); | 583 | per_dev->offset, |
584 | ios->kern_buff, ios->length); | ||
594 | if (unlikely(ret)) | 585 | if (unlikely(ret)) |
595 | goto out; | 586 | goto out; |
596 | EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " | 587 | ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " |
597 | "length=0x%llx dev=%d\n", | 588 | "length=0x%llx dev=%d\n", |
598 | _LLU(ios->obj.id), _LLU(per_dev->offset), | 589 | _LLU(_ios_obj(ios, dev)->id), |
590 | _LLU(per_dev->offset), | ||
599 | _LLU(ios->length), dev); | 591 | _LLU(ios->length), dev); |
600 | } else { | 592 | } else { |
601 | osd_req_set_attributes(or, &ios->obj); | 593 | osd_req_set_attributes(or, _ios_obj(ios, dev)); |
602 | EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", | 594 | ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", |
603 | _LLU(ios->obj.id), ios->out_attr_len, dev); | 595 | _LLU(_ios_obj(ios, dev)->id), |
596 | ios->out_attr_len, dev); | ||
604 | } | 597 | } |
605 | 598 | ||
606 | if (ios->out_attr) | 599 | if (ios->out_attr) |
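_write_mirror() dispatches on what the io_state carries, and the same triage repeats in the read path below: page-cache pages become a bio write (cloned per extra mirror), a kernel buffer becomes osd_req_write_kern(), and neither means an attributes-only request. Schematically:

	if (ios->pages) {		/* bio per mirror, cloned from master */
		osd_req_write(or, obj, per_dev->offset, bio, per_dev->length);
	} else if (ios->kern_buff) {	/* flat kernel buffer */
		ret = osd_req_write_kern(or, obj, per_dev->offset,
					 ios->kern_buff, ios->length);
	} else {			/* no data: set attributes only */
		osd_req_set_attributes(or, obj);
	}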
@@ -616,7 +609,7 @@ out: | |||
616 | return ret; | 609 | return ret; |
617 | } | 610 | } |
618 | 611 | ||
619 | int exofs_sbi_write(struct exofs_io_state *ios) | 612 | int ore_write(struct ore_io_state *ios) |
620 | { | 613 | { |
621 | int i; | 614 | int i; |
622 | int ret; | 615 | int ret; |
@@ -626,52 +619,55 @@ int exofs_sbi_write(struct exofs_io_state *ios) | |||
626 | return ret; | 619 | return ret; |
627 | 620 | ||
628 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | 621 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { |
629 | ret = _sbi_write_mirror(ios, i); | 622 | ret = _write_mirror(ios, i); |
630 | if (unlikely(ret)) | 623 | if (unlikely(ret)) |
631 | return ret; | 624 | return ret; |
632 | } | 625 | } |
633 | 626 | ||
634 | ret = exofs_io_execute(ios); | 627 | ret = ore_io_execute(ios); |
635 | return ret; | 628 | return ret; |
636 | } | 629 | } |
630 | EXPORT_SYMBOL(ore_write); | ||
637 | 631 | ||
638 | static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) | 632 | static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp) |
639 | { | 633 | { |
640 | struct osd_request *or; | 634 | struct osd_request *or; |
641 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | 635 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; |
642 | unsigned first_dev = (unsigned)ios->obj.id; | 636 | struct osd_obj_id *obj = _ios_obj(ios, cur_comp); |
637 | unsigned first_dev = (unsigned)obj->id; | ||
643 | 638 | ||
644 | if (ios->pages && !per_dev->length) | 639 | if (ios->pages && !per_dev->length) |
645 | return 0; /* Just an empty slot */ | 640 | return 0; /* Just an empty slot */ |
646 | 641 | ||
647 | first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; | 642 | first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; |
648 | or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); | 643 | or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); |
649 | if (unlikely(!or)) { | 644 | if (unlikely(!or)) { |
650 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 645 | ORE_ERR("%s: osd_start_request failed\n", __func__); |
651 | return -ENOMEM; | 646 | return -ENOMEM; |
652 | } | 647 | } |
653 | per_dev->or = or; | 648 | per_dev->or = or; |
654 | 649 | ||
655 | if (ios->pages) { | 650 | if (ios->pages) { |
656 | osd_req_read(or, &ios->obj, per_dev->offset, | 651 | osd_req_read(or, obj, per_dev->offset, |
657 | per_dev->bio, per_dev->length); | 652 | per_dev->bio, per_dev->length); |
658 | EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" | 653 | ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" |
659 | " dev=%d\n", _LLU(ios->obj.id), | 654 | " dev=%d\n", _LLU(obj->id), |
660 | _LLU(per_dev->offset), _LLU(per_dev->length), | 655 | _LLU(per_dev->offset), _LLU(per_dev->length), |
661 | first_dev); | 656 | first_dev); |
662 | } else if (ios->kern_buff) { | 657 | } else if (ios->kern_buff) { |
663 | int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, | 658 | int ret = osd_req_read_kern(or, obj, per_dev->offset, |
664 | ios->kern_buff, ios->length); | 659 | ios->kern_buff, ios->length); |
665 | EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " | 660 | ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx " |
666 | "length=0x%llx dev=%d ret=>%d\n", | 661 | "length=0x%llx dev=%d ret=>%d\n", |
667 | _LLU(ios->obj.id), _LLU(per_dev->offset), | 662 | _LLU(obj->id), _LLU(per_dev->offset), |
668 | _LLU(ios->length), first_dev, ret); | 663 | _LLU(ios->length), first_dev, ret); |
669 | if (unlikely(ret)) | 664 | if (unlikely(ret)) |
670 | return ret; | 665 | return ret; |
671 | } else { | 666 | } else { |
672 | osd_req_get_attributes(or, &ios->obj); | 667 | osd_req_get_attributes(or, obj); |
673 | EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", | 668 | ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", |
674 | _LLU(ios->obj.id), ios->in_attr_len, first_dev); | 669 | _LLU(obj->id), |
670 | ios->in_attr_len, first_dev); | ||
675 | } | 671 | } |
676 | if (ios->out_attr) | 672 | if (ios->out_attr) |
677 | osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); | 673 | osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); |
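Reads touch only one replica per stripe member: `first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1` uses the object id to pick a deterministic mirror, spreading read load across the mirror set. A tiny stand-alone model of that selection (assuming, as elsewhere in this file, that mirrors_p1 counts the primary plus its mirrors):

	#include <stdio.h>
	#include <stdint.h>

	static unsigned pick_read_dev(unsigned group_first_dev, uint64_t obj_id,
				      unsigned mirrors_p1)
	{
		/* group_first_dev is a multiple of mirrors_p1; the object id
		 * deterministically selects one replica inside the group */
		return group_first_dev + (unsigned)(obj_id % mirrors_p1);
	}

	int main(void)
	{
		uint64_t id;

		for (id = 0x10; id < 0x14; id++)
			printf("obj 0x%llx reads from dev %u\n",
			       (unsigned long long)id, pick_read_dev(0, id, 3));
		return 0;
	}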
@@ -682,7 +678,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) | |||
682 | return 0; | 678 | return 0; |
683 | } | 679 | } |
684 | 680 | ||
685 | int exofs_sbi_read(struct exofs_io_state *ios) | 681 | int ore_read(struct ore_io_state *ios) |
686 | { | 682 | { |
687 | int i; | 683 | int i; |
688 | int ret; | 684 | int ret; |
@@ -692,16 +688,17 @@ int exofs_sbi_read(struct exofs_io_state *ios) | |||
692 | return ret; | 688 | return ret; |
693 | 689 | ||
694 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { | 690 | for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { |
695 | ret = _sbi_read_mirror(ios, i); | 691 | ret = _read_mirror(ios, i); |
696 | if (unlikely(ret)) | 692 | if (unlikely(ret)) |
697 | return ret; | 693 | return ret; |
698 | } | 694 | } |
699 | 695 | ||
700 | ret = exofs_io_execute(ios); | 696 | ret = ore_io_execute(ios); |
701 | return ret; | 697 | return ret; |
702 | } | 698 | } |
699 | EXPORT_SYMBOL(ore_read); | ||
703 | 700 | ||
704 | int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) | 701 | int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) |
705 | { | 702 | { |
706 | struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ | 703 | struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ |
707 | void *iter = NULL; | 704 | void *iter = NULL; |
@@ -721,83 +718,118 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) | |||
721 | 718 | ||
722 | return -EIO; | 719 | return -EIO; |
723 | } | 720 | } |
721 | EXPORT_SYMBOL(extract_attr_from_ios); | ||
724 | 722 | ||
725 | static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, | 723 | static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, |
726 | struct osd_attr *attr) | 724 | struct osd_attr *attr) |
727 | { | 725 | { |
728 | int last_comp = cur_comp + ios->layout->mirrors_p1; | 726 | int last_comp = cur_comp + ios->layout->mirrors_p1; |
729 | 727 | ||
730 | for (; cur_comp < last_comp; ++cur_comp) { | 728 | for (; cur_comp < last_comp; ++cur_comp) { |
731 | struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; | 729 | struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; |
732 | struct osd_request *or; | 730 | struct osd_request *or; |
733 | 731 | ||
734 | or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); | 732 | or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); |
735 | if (unlikely(!or)) { | 733 | if (unlikely(!or)) { |
736 | EXOFS_ERR("%s: osd_start_request failed\n", __func__); | 734 | ORE_ERR("%s: osd_start_request failed\n", __func__); |
737 | return -ENOMEM; | 735 | return -ENOMEM; |
738 | } | 736 | } |
739 | per_dev->or = or; | 737 | per_dev->or = or; |
740 | 738 | ||
741 | osd_req_set_attributes(or, &ios->obj); | 739 | osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); |
742 | osd_req_add_set_attr_list(or, attr, 1); | 740 | osd_req_add_set_attr_list(or, attr, 1); |
743 | } | 741 | } |
744 | 742 | ||
745 | return 0; | 743 | return 0; |
746 | } | 744 | } |
747 | 745 | ||
748 | int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) | 746 | struct _trunc_info { |
747 | struct _striping_info si; | ||
748 | u64 prev_group_obj_off; | ||
749 | u64 next_group_obj_off; | ||
750 | |||
751 | unsigned first_group_dev; | ||
752 | unsigned nex_group_dev; | ||
753 | unsigned max_devs; | ||
754 | }; | ||
755 | |||
756 | void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, | ||
757 | struct _trunc_info *ti) | ||
758 | { | ||
759 | unsigned stripe_unit = layout->stripe_unit; | ||
760 | |||
761 | _calc_stripe_info(layout, file_offset, &ti->si); | ||
762 | |||
763 | ti->prev_group_obj_off = ti->si.M * stripe_unit; | ||
764 | ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; | ||
765 | |||
766 | ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); | ||
767 | ti->nex_group_dev = ti->first_group_dev + layout->group_width; | ||
768 | ti->max_devs = layout->group_width * layout->group_count; | ||
769 | } | ||
770 | |||
771 | int ore_truncate(struct ore_layout *layout, struct ore_components *comps, | ||
772 | u64 size) | ||
749 | { | 773 | { |
750 | struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; | 774 | struct ore_io_state *ios; |
751 | struct exofs_io_state *ios; | ||
752 | struct exofs_trunc_attr { | 775 | struct exofs_trunc_attr { |
753 | struct osd_attr attr; | 776 | struct osd_attr attr; |
754 | __be64 newsize; | 777 | __be64 newsize; |
755 | } *size_attrs; | 778 | } *size_attrs; |
756 | struct _striping_info si; | 779 | struct _trunc_info ti; |
757 | int i, ret; | 780 | int i, ret; |
758 | 781 | ||
759 | ret = exofs_get_io_state(&sbi->layout, &ios); | 782 | ret = ore_get_io_state(layout, comps, &ios); |
760 | if (unlikely(ret)) | 783 | if (unlikely(ret)) |
761 | return ret; | 784 | return ret; |
762 | 785 | ||
763 | size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), | 786 | _calc_trunk_info(ios->layout, size, &ti); |
787 | |||
788 | size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs), | ||
764 | GFP_KERNEL); | 789 | GFP_KERNEL); |
765 | if (unlikely(!size_attrs)) { | 790 | if (unlikely(!size_attrs)) { |
766 | ret = -ENOMEM; | 791 | ret = -ENOMEM; |
767 | goto out; | 792 | goto out; |
768 | } | 793 | } |
769 | 794 | ||
770 | ios->obj.id = exofs_oi_objno(oi); | 795 | ios->numdevs = ios->comps->numdevs; |
771 | ios->cred = oi->i_cred; | ||
772 | 796 | ||
773 | ios->numdevs = ios->layout->s_numdevs; | 797 | for (i = 0; i < ti.max_devs; ++i) { |
774 | _calc_stripe_info(ios, size, &si); | ||
775 | |||
776 | for (i = 0; i < ios->layout->group_width; ++i) { | ||
777 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; | 798 | struct exofs_trunc_attr *size_attr = &size_attrs[i]; |
778 | u64 obj_size; | 799 | u64 obj_size; |
779 | 800 | ||
780 | if (i < si.dev) | 801 | if (i < ti.first_group_dev) |
781 | obj_size = si.obj_offset + | 802 | obj_size = ti.prev_group_obj_off; |
782 | ios->layout->stripe_unit - si.unit_off; | 803 | else if (i >= ti.nex_group_dev) |
783 | else if (i == si.dev) | 804 | obj_size = ti.next_group_obj_off; |
784 | obj_size = si.obj_offset; | 805 | else if (i < ti.si.dev) /* dev within this group */ |
785 | else /* i > si.dev */ | 806 | obj_size = ti.si.obj_offset + |
786 | obj_size = si.obj_offset - si.unit_off; | 807 | ios->layout->stripe_unit - ti.si.unit_off; |
808 | else if (i == ti.si.dev) | ||
809 | obj_size = ti.si.obj_offset; | ||
810 | else /* i > ti.dev */ | ||
811 | obj_size = ti.si.obj_offset - ti.si.unit_off; | ||
787 | 812 | ||
788 | size_attr->newsize = cpu_to_be64(obj_size); | 813 | size_attr->newsize = cpu_to_be64(obj_size); |
789 | size_attr->attr = g_attr_logical_length; | 814 | size_attr->attr = g_attr_logical_length; |
790 | size_attr->attr.val_ptr = &size_attr->newsize; | 815 | size_attr->attr.val_ptr = &size_attr->newsize; |
791 | 816 | ||
817 | ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", | ||
818 | _LLU(comps->comps->obj.id), _LLU(obj_size), i); | ||
792 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, | 819 | ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, |
793 | &size_attr->attr); | 820 | &size_attr->attr); |
794 | if (unlikely(ret)) | 821 | if (unlikely(ret)) |
795 | goto out; | 822 | goto out; |
796 | } | 823 | } |
797 | ret = exofs_io_execute(ios); | 824 | ret = ore_io_execute(ios); |
798 | 825 | ||
799 | out: | 826 | out: |
800 | kfree(size_attrs); | 827 | kfree(size_attrs); |
801 | exofs_put_io_state(ios); | 828 | ore_put_io_state(ios); |
802 | return ret; | 829 | return ret; |
803 | } | 830 | } |
831 | EXPORT_SYMBOL(ore_truncate); | ||
832 | |||
833 | const struct osd_attr g_attr_logical_length = ATTR_DEF( | ||
834 | OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); | ||
835 | EXPORT_SYMBOL(g_attr_logical_length); | ||
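The truncate loop gives every device a new object length according to where the cut point falls relative to that device's stripes: devices before this group stop at the previous group boundary, devices past it at the next, devices earlier in the stripe keep one extra stripe_unit, the device holding the cut is trimmed to obj_offset exactly, and later devices lose the partial unit. The five-way decision, lifted out of the loop as a refactoring sketch (not code from the patch):

	static u64 trunc_obj_size(unsigned i, const struct _trunc_info *ti,
				  unsigned stripe_unit)
	{
		if (i < ti->first_group_dev)
			return ti->prev_group_obj_off;
		if (i >= ti->nex_group_dev)
			return ti->next_group_obj_off;
		if (i < ti->si.dev)		/* earlier in the stripe */
			return ti->si.obj_offset + stripe_unit -
			       ti->si.unit_off;
		if (i == ti->si.dev)		/* holds the cut point */
			return ti->si.obj_offset;
		return ti->si.obj_offset - ti->si.unit_off; /* later in stripe */
	}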
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h deleted file mode 100644 index c52e9888b8ab..000000000000 --- a/fs/exofs/pnfs.h +++ /dev/null | |||
@@ -1,45 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008, 2009 | ||
3 | * Boaz Harrosh <bharrosh@panasas.com> | ||
4 | * | ||
5 | * This file is part of exofs. | ||
6 | * | ||
7 | * exofs is free software; you can redistribute it and/or modify it under the | ||
8 | * terms of the GNU General Public License version 2 as published by the Free | ||
9 | * Software Foundation. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | /* FIXME: Remove this file once pnfs hits mainline */ | ||
14 | |||
15 | #ifndef __EXOFS_PNFS_H__ | ||
16 | #define __EXOFS_PNFS_H__ | ||
17 | |||
18 | #if ! defined(__PNFS_OSD_XDR_H__) | ||
19 | |||
20 | enum pnfs_iomode { | ||
21 | IOMODE_READ = 1, | ||
22 | IOMODE_RW = 2, | ||
23 | IOMODE_ANY = 3, | ||
24 | }; | ||
25 | |||
26 | /* Layout Structure */ | ||
27 | enum pnfs_osd_raid_algorithm4 { | ||
28 | PNFS_OSD_RAID_0 = 1, | ||
29 | PNFS_OSD_RAID_4 = 2, | ||
30 | PNFS_OSD_RAID_5 = 3, | ||
31 | PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ | ||
32 | }; | ||
33 | |||
34 | struct pnfs_osd_data_map { | ||
35 | u32 odm_num_comps; | ||
36 | u64 odm_stripe_unit; | ||
37 | u32 odm_group_width; | ||
38 | u32 odm_group_depth; | ||
39 | u32 odm_mirror_cnt; | ||
40 | u32 odm_raid_algorithm; | ||
41 | }; | ||
42 | |||
43 | #endif /* ! defined(__PNFS_OSD_XDR_H__) */ | ||
44 | |||
45 | #endif /* __EXOFS_PNFS_H__ */ | ||
diff --git a/fs/exofs/super.c b/fs/exofs/super.c index c57beddcc217..274894053b02 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c | |||
@@ -40,6 +40,8 @@ | |||
40 | 40 | ||
41 | #include "exofs.h" | 41 | #include "exofs.h" |
42 | 42 | ||
43 | #define EXOFS_DBGMSG2(M...) do {} while (0) | ||
44 | |||
43 | /****************************************************************************** | 45 | /****************************************************************************** |
44 | * MOUNT OPTIONS | 46 | * MOUNT OPTIONS |
45 | *****************************************************************************/ | 47 | *****************************************************************************/ |
@@ -208,10 +210,48 @@ static void destroy_inodecache(void) | |||
208 | } | 210 | } |
209 | 211 | ||
210 | /****************************************************************************** | 212 | /****************************************************************************** |
211 | * SUPERBLOCK FUNCTIONS | 213 | * Some osd helpers |
212 | *****************************************************************************/ | 214 | *****************************************************************************/ |
213 | static const struct super_operations exofs_sops; | 215 | void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) |
214 | static const struct export_operations exofs_export_ops; | 216 | { |
217 | osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); | ||
218 | } | ||
219 | |||
220 | static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, | ||
221 | u64 offset, void *p, unsigned length) | ||
222 | { | ||
223 | struct osd_request *or = osd_start_request(od, GFP_KERNEL); | ||
224 | /* struct osd_sense_info osi = {.key = 0};*/ | ||
225 | int ret; | ||
226 | |||
227 | if (unlikely(!or)) { | ||
228 | EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); | ||
229 | return -ENOMEM; | ||
230 | } | ||
231 | ret = osd_req_read_kern(or, obj, offset, p, length); | ||
232 | if (unlikely(ret)) { | ||
233 | EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); | ||
234 | goto out; | ||
235 | } | ||
236 | |||
237 | ret = osd_finalize_request(or, 0, cred, NULL); | ||
238 | if (unlikely(ret)) { | ||
239 | EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); | ||
240 | goto out; | ||
241 | } | ||
242 | |||
243 | ret = osd_execute_request(or); | ||
244 | if (unlikely(ret)) | ||
245 | EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); | ||
246 | /* osd_req_decode_sense(or, ret); */ | ||
247 | |||
248 | out: | ||
249 | osd_end_request(or); | ||
250 | EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " | ||
251 | "length=0x%llx dev=%p ret=>%d\n", | ||
252 | _LLU(obj->id), _LLU(offset), _LLU(length), od, ret); | ||
253 | return ret; | ||
254 | } | ||
215 | 255 | ||
216 | static const struct osd_attr g_attr_sb_stats = ATTR_DEF( | 256 | static const struct osd_attr g_attr_sb_stats = ATTR_DEF( |
217 | EXOFS_APAGE_SB_DATA, | 257 | EXOFS_APAGE_SB_DATA, |
@@ -223,21 +263,19 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) | |||
223 | struct osd_attr attrs[] = { | 263 | struct osd_attr attrs[] = { |
224 | [0] = g_attr_sb_stats, | 264 | [0] = g_attr_sb_stats, |
225 | }; | 265 | }; |
226 | struct exofs_io_state *ios; | 266 | struct ore_io_state *ios; |
227 | int ret; | 267 | int ret; |
228 | 268 | ||
229 | ret = exofs_get_io_state(&sbi->layout, &ios); | 269 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); |
230 | if (unlikely(ret)) { | 270 | if (unlikely(ret)) { |
231 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); | 271 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
232 | return ret; | 272 | return ret; |
233 | } | 273 | } |
234 | 274 | ||
235 | ios->cred = sbi->s_cred; | ||
236 | |||
237 | ios->in_attr = attrs; | 275 | ios->in_attr = attrs; |
238 | ios->in_attr_len = ARRAY_SIZE(attrs); | 276 | ios->in_attr_len = ARRAY_SIZE(attrs); |
239 | 277 | ||
240 | ret = exofs_sbi_read(ios); | 278 | ret = ore_read(ios); |
241 | if (unlikely(ret)) { | 279 | if (unlikely(ret)) { |
242 | EXOFS_ERR("Error reading super_block stats => %d\n", ret); | 280 | EXOFS_ERR("Error reading super_block stats => %d\n", ret); |
243 | goto out; | 281 | goto out; |
@@ -264,13 +302,13 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi) | |||
264 | } | 302 | } |
265 | 303 | ||
266 | out: | 304 | out: |
267 | exofs_put_io_state(ios); | 305 | ore_put_io_state(ios); |
268 | return ret; | 306 | return ret; |
269 | } | 307 | } |
270 | 308 | ||
271 | static void stats_done(struct exofs_io_state *ios, void *p) | 309 | static void stats_done(struct ore_io_state *ios, void *p) |
272 | { | 310 | { |
273 | exofs_put_io_state(ios); | 311 | ore_put_io_state(ios); |
274 | /* Good thanks nothing to do anymore */ | 312 | /* Good thanks nothing to do anymore */ |
275 | } | 313 | } |
276 | 314 | ||
@@ -280,12 +318,12 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) | |||
280 | struct osd_attr attrs[] = { | 318 | struct osd_attr attrs[] = { |
281 | [0] = g_attr_sb_stats, | 319 | [0] = g_attr_sb_stats, |
282 | }; | 320 | }; |
283 | struct exofs_io_state *ios; | 321 | struct ore_io_state *ios; |
284 | int ret; | 322 | int ret; |
285 | 323 | ||
286 | ret = exofs_get_io_state(&sbi->layout, &ios); | 324 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); |
287 | if (unlikely(ret)) { | 325 | if (unlikely(ret)) { |
288 | EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); | 326 | EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); |
289 | return ret; | 327 | return ret; |
290 | } | 328 | } |
291 | 329 | ||
@@ -293,21 +331,27 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi) | |||
293 | sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); | 331 | sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); |
294 | attrs[0].val_ptr = &sbi->s_ess; | 332 | attrs[0].val_ptr = &sbi->s_ess; |
295 | 333 | ||
296 | ios->cred = sbi->s_cred; | 334 | |
297 | ios->done = stats_done; | 335 | ios->done = stats_done; |
298 | ios->private = sbi; | 336 | ios->private = sbi; |
299 | ios->out_attr = attrs; | 337 | ios->out_attr = attrs; |
300 | ios->out_attr_len = ARRAY_SIZE(attrs); | 338 | ios->out_attr_len = ARRAY_SIZE(attrs); |
301 | 339 | ||
302 | ret = exofs_sbi_write(ios); | 340 | ret = ore_write(ios); |
303 | if (unlikely(ret)) { | 341 | if (unlikely(ret)) { |
304 | EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); | 342 | EXOFS_ERR("%s: ore_write failed.\n", __func__); |
305 | exofs_put_io_state(ios); | 343 | ore_put_io_state(ios); |
306 | } | 344 | } |
307 | 345 | ||
308 | return ret; | 346 | return ret; |
309 | } | 347 | } |
310 | 348 | ||
349 | /****************************************************************************** | ||
350 | * SUPERBLOCK FUNCTIONS | ||
351 | *****************************************************************************/ | ||
352 | static const struct super_operations exofs_sops; | ||
353 | static const struct export_operations exofs_export_ops; | ||
354 | |||
311 | /* | 355 | /* |
312 | * Write the superblock to the OSD | 356 | * Write the superblock to the OSD |
313 | */ | 357 | */ |
@@ -315,7 +359,9 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
315 | { | 359 | { |
316 | struct exofs_sb_info *sbi; | 360 | struct exofs_sb_info *sbi; |
317 | struct exofs_fscb *fscb; | 361 | struct exofs_fscb *fscb; |
318 | struct exofs_io_state *ios; | 362 | struct ore_comp one_comp; |
363 | struct ore_components comps; | ||
364 | struct ore_io_state *ios; | ||
319 | int ret = -ENOMEM; | 365 | int ret = -ENOMEM; |
320 | 366 | ||
321 | fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); | 367 | fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); |
@@ -331,7 +377,10 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
331 | * version). Otherwise the exofs_fscb is read-only from mkfs time. All | 377 | * version). Otherwise the exofs_fscb is read-only from mkfs time. All |
332 | * the writeable info is set in exofs_sbi_write_stats() above. | 378 | * the writeable info is set in exofs_sbi_write_stats() above. |
333 | */ | 379 | */ |
334 | ret = exofs_get_io_state(&sbi->layout, &ios); | 380 | |
381 | exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID); | ||
382 | |||
383 | ret = ore_get_io_state(&sbi->layout, &comps, &ios); | ||
335 | if (unlikely(ret)) | 384 | if (unlikely(ret)) |
336 | goto out; | 385 | goto out; |
337 | 386 | ||
@@ -345,14 +394,12 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
345 | fscb->s_newfs = 0; | 394 | fscb->s_newfs = 0; |
346 | fscb->s_version = EXOFS_FSCB_VER; | 395 | fscb->s_version = EXOFS_FSCB_VER; |
347 | 396 | ||
348 | ios->obj.id = EXOFS_SUPER_ID; | ||
349 | ios->offset = 0; | 397 | ios->offset = 0; |
350 | ios->kern_buff = fscb; | 398 | ios->kern_buff = fscb; |
351 | ios->cred = sbi->s_cred; | ||
352 | 399 | ||
353 | ret = exofs_sbi_write(ios); | 400 | ret = ore_write(ios); |
354 | if (unlikely(ret)) | 401 | if (unlikely(ret)) |
355 | EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); | 402 | EXOFS_ERR("%s: ore_write failed.\n", __func__); |
356 | else | 403 | else |
357 | sb->s_dirt = 0; | 404 | sb->s_dirt = 0; |
358 | 405 | ||
@@ -360,7 +407,7 @@ int exofs_sync_fs(struct super_block *sb, int wait) | |||
360 | unlock_super(sb); | 407 | unlock_super(sb); |
361 | out: | 408 | out: |
362 | EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); | 409 | EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); |
363 | exofs_put_io_state(ios); | 410 | ore_put_io_state(ios); |
364 | kfree(fscb); | 411 | kfree(fscb); |
365 | return ret; | 412 | return ret; |
366 | } | 413 | } |
@@ -384,15 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path, | |||
384 | 431 | ||
385 | void exofs_free_sbi(struct exofs_sb_info *sbi) | 432 | void exofs_free_sbi(struct exofs_sb_info *sbi) |
386 | { | 433 | { |
387 | while (sbi->layout.s_numdevs) { | 434 | while (sbi->comps.numdevs) { |
388 | int i = --sbi->layout.s_numdevs; | 435 | int i = --sbi->comps.numdevs; |
389 | struct osd_dev *od = sbi->layout.s_ods[i]; | 436 | struct osd_dev *od = sbi->comps.ods[i]; |
390 | 437 | ||
391 | if (od) { | 438 | if (od) { |
392 | sbi->layout.s_ods[i] = NULL; | 439 | sbi->comps.ods[i] = NULL; |
393 | osduld_put_device(od); | 440 | osduld_put_device(od); |
394 | } | 441 | } |
395 | } | 442 | } |
443 | if (sbi->comps.ods != sbi->_min_one_dev) | ||
444 | kfree(sbi->comps.ods); | ||
396 | kfree(sbi); | 445 | kfree(sbi); |
397 | } | 446 | } |
398 | 447 | ||
@@ -419,8 +468,8 @@ static void exofs_put_super(struct super_block *sb) | |||
419 | msecs_to_jiffies(100)); | 468 | msecs_to_jiffies(100)); |
420 | } | 469 | } |
421 | 470 | ||
422 | _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], | 471 | _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0], |
423 | sbi->layout.s_pid); | 472 | sbi->one_comp.obj.partition); |
424 | 473 | ||
425 | bdi_destroy(&sbi->bdi); | 474 | bdi_destroy(&sbi->bdi); |
426 | exofs_free_sbi(sbi); | 475 | exofs_free_sbi(sbi); |
@@ -501,10 +550,19 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, | |||
501 | return -EINVAL; | 550 | return -EINVAL; |
502 | } | 551 | } |
503 | 552 | ||
553 | EXOFS_DBGMSG("exofs: layout: " | ||
554 | "num_comps=%u stripe_unit=0x%x group_width=%u " | ||
555 | "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n", | ||
556 | numdevs, | ||
557 | sbi->layout.stripe_unit, | ||
558 | sbi->layout.group_width, | ||
559 | _LLU(sbi->layout.group_depth), | ||
560 | sbi->layout.mirrors_p1, | ||
561 | sbi->data_map.odm_raid_algorithm); | ||
504 | return 0; | 562 | return 0; |
505 | } | 563 | } |
506 | 564 | ||
507 | static unsigned __ra_pages(struct exofs_layout *layout) | 565 | static unsigned __ra_pages(struct ore_layout *layout) |
508 | { | 566 | { |
509 | const unsigned _MIN_RA = 32; /* min 128K read-ahead */ | 567 | const unsigned _MIN_RA = 32; /* min 128K read-ahead */ |
510 | unsigned ra_pages = layout->group_width * layout->stripe_unit / | 568 | unsigned ra_pages = layout->group_width * layout->stripe_unit / |
@@ -547,13 +605,11 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, | |||
547 | return !(odi->systemid_len || odi->osdname_len); | 605 | return !(odi->systemid_len || odi->osdname_len); |
548 | } | 606 | } |
549 | 607 | ||
550 | static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, | 608 | static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, |
609 | struct osd_dev *fscb_od, | ||
551 | unsigned table_count) | 610 | unsigned table_count) |
552 | { | 611 | { |
553 | struct exofs_sb_info *sbi = *psbi; | 612 | struct ore_comp comp; |
554 | struct osd_dev *fscb_od; | ||
555 | struct osd_obj_id obj = {.partition = sbi->layout.s_pid, | ||
556 | .id = EXOFS_DEVTABLE_ID}; | ||
557 | struct exofs_device_table *dt; | 613 | struct exofs_device_table *dt; |
558 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + | 614 | unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + |
559 | sizeof(*dt); | 615 | sizeof(*dt); |
@@ -567,10 +623,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, | |||
567 | return -ENOMEM; | 623 | return -ENOMEM; |
568 | } | 624 | } |
569 | 625 | ||
570 | fscb_od = sbi->layout.s_ods[0]; | 626 | sbi->comps.numdevs = 0; |
571 | sbi->layout.s_ods[0] = NULL; | 627 | |
572 | sbi->layout.s_numdevs = 0; | 628 | comp.obj.partition = sbi->one_comp.obj.partition; |
573 | ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); | 629 | comp.obj.id = EXOFS_DEVTABLE_ID; |
630 | exofs_make_credential(comp.cred, &comp.obj); | ||
631 | |||
632 | ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt, | ||
633 | table_bytes); | ||
574 | if (unlikely(ret)) { | 634 | if (unlikely(ret)) { |
575 | EXOFS_ERR("ERROR: reading device table\n"); | 635 | EXOFS_ERR("ERROR: reading device table\n"); |
576 | goto out; | 636 | goto out; |
@@ -588,16 +648,18 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, | |||
588 | goto out; | 648 | goto out; |
589 | 649 | ||
590 | if (likely(numdevs > 1)) { | 650 | if (likely(numdevs > 1)) { |
591 | unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); | 651 | unsigned size = numdevs * sizeof(sbi->comps.ods[0]); |
592 | 652 | ||
593 | sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); | 653 | /* Twice-bigger table: see exofs_init_comps() and below |
594 | if (unlikely(!sbi)) { | 654 | * comment |
655 | */ | ||
656 | sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL); | ||
657 | if (unlikely(!sbi->comps.ods)) { | ||
658 | EXOFS_ERR("ERROR: faild allocating Device array[%d]\n", | ||
659 | numdevs); | ||
595 | ret = -ENOMEM; | 660 | ret = -ENOMEM; |
596 | goto out; | 661 | goto out; |
597 | } | 662 | } |
598 | memset(&sbi->layout.s_ods[1], 0, | ||
599 | size - sizeof(sbi->layout.s_ods[0])); | ||
600 | *psbi = sbi; | ||
601 | } | 663 | } |
602 | 664 | ||
603 | for (i = 0; i < numdevs; i++) { | 665 | for (i = 0; i < numdevs; i++) { |
@@ -619,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, | |||
619 | * line. We always keep them in device-table order. | 681 | * line. We always keep them in device-table order. |
620 | */ | 682 | */ |
621 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { | 683 | if (fscb_od && osduld_device_same(fscb_od, &odi)) { |
622 | sbi->layout.s_ods[i] = fscb_od; | 684 | sbi->comps.ods[i] = fscb_od; |
623 | ++sbi->layout.s_numdevs; | 685 | ++sbi->comps.numdevs; |
624 | fscb_od = NULL; | 686 | fscb_od = NULL; |
625 | continue; | 687 | continue; |
626 | } | 688 | } |
@@ -633,13 +695,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, | |||
633 | goto out; | 695 | goto out; |
634 | } | 696 | } |
635 | 697 | ||
636 | sbi->layout.s_ods[i] = od; | 698 | sbi->comps.ods[i] = od; |
637 | ++sbi->layout.s_numdevs; | 699 | ++sbi->comps.numdevs; |
638 | 700 | ||
639 | /* Read the fscb of the other devices to make sure the FS | 701 | /* Read the fscb of the other devices to make sure the FS |
640 | * partition is there. | 702 | * partition is there. |
641 | */ | 703 | */ |
642 | ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, | 704 | ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, |
643 | sizeof(fscb)); | 705 | sizeof(fscb)); |
644 | if (unlikely(ret)) { | 706 | if (unlikely(ret)) { |
645 | EXOFS_ERR("ERROR: Malformed participating device " | 707 | EXOFS_ERR("ERROR: Malformed participating device " |
@@ -656,13 +718,22 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, | |||
656 | 718 | ||
657 | out: | 719 | out: |
658 | kfree(dt); | 720 | kfree(dt); |
659 | if (unlikely(!ret && fscb_od)) { | 721 | if (likely(!ret)) { |
660 | EXOFS_ERR( | 722 | unsigned numdevs = sbi->comps.numdevs; |
661 | "ERROR: Bad device-table container device not present\n"); | ||
662 | osduld_put_device(fscb_od); | ||
663 | ret = -EINVAL; | ||
664 | } | ||
665 | 723 | ||
724 | if (unlikely(fscb_od)) { | ||
725 | EXOFS_ERR("ERROR: Bad device-table container device not present\n"); | ||
726 | osduld_put_device(fscb_od); | ||
727 | return -EINVAL; | ||
728 | } | ||
729 | /* exofs round-robins the device table view according to inode | ||
730 | * number. We hold a twice-bigger table, hence inodes can point | ||
731 | * to any device and have a sequential view of the table | ||
732 | * starting at this device. See exofs_init_comps() | ||
733 | */ | ||
734 | for (i = 0; i < numdevs - 1; ++i) | ||
735 | sbi->comps.ods[i + numdevs] = sbi->comps.ods[i]; | ||
736 | } | ||
666 | return ret; | 737 | return ret; |
667 | } | 738 | } |
668 | 739 | ||
@@ -676,7 +747,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
676 | struct exofs_sb_info *sbi; /*extended info */ | 747 | struct exofs_sb_info *sbi; /*extended info */ |
677 | struct osd_dev *od; /* Master device */ | 748 | struct osd_dev *od; /* Master device */ |
678 | struct exofs_fscb fscb; /*on-disk superblock info */ | 749 | struct exofs_fscb fscb; /*on-disk superblock info */ |
679 | struct osd_obj_id obj; | 750 | struct ore_comp comp; |
680 | unsigned table_count; | 751 | unsigned table_count; |
681 | int ret; | 752 | int ret; |
682 | 753 | ||
@@ -684,10 +755,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
684 | if (!sbi) | 755 | if (!sbi) |
685 | return -ENOMEM; | 756 | return -ENOMEM; |
686 | 757 | ||
687 | ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); | ||
688 | if (ret) | ||
689 | goto free_bdi; | ||
690 | |||
691 | /* use mount options to fill superblock */ | 758 | /* use mount options to fill superblock */ |
692 | if (opts->is_osdname) { | 759 | if (opts->is_osdname) { |
693 | struct osd_dev_info odi = {.systemid_len = 0}; | 760 | struct osd_dev_info odi = {.systemid_len = 0}; |
@@ -695,6 +762,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
695 | odi.osdname_len = strlen(opts->dev_name); | 762 | odi.osdname_len = strlen(opts->dev_name); |
696 | odi.osdname = (u8 *)opts->dev_name; | 763 | odi.osdname = (u8 *)opts->dev_name; |
697 | od = osduld_info_lookup(&odi); | 764 | od = osduld_info_lookup(&odi); |
765 | kfree(opts->dev_name); | ||
766 | opts->dev_name = NULL; | ||
698 | } else { | 767 | } else { |
699 | od = osduld_path_lookup(opts->dev_name); | 768 | od = osduld_path_lookup(opts->dev_name); |
700 | } | 769 | } |
@@ -709,11 +778,16 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
709 | sbi->layout.group_width = 1; | 778 | sbi->layout.group_width = 1; |
710 | sbi->layout.group_depth = -1; | 779 | sbi->layout.group_depth = -1; |
711 | sbi->layout.group_count = 1; | 780 | sbi->layout.group_count = 1; |
712 | sbi->layout.s_ods[0] = od; | ||
713 | sbi->layout.s_numdevs = 1; | ||
714 | sbi->layout.s_pid = opts->pid; | ||
715 | sbi->s_timeout = opts->timeout; | 781 | sbi->s_timeout = opts->timeout; |
716 | 782 | ||
783 | sbi->one_comp.obj.partition = opts->pid; | ||
784 | sbi->one_comp.obj.id = 0; | ||
785 | exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); | ||
786 | sbi->comps.numdevs = 1; | ||
787 | sbi->comps.single_comp = EC_SINGLE_COMP; | ||
788 | sbi->comps.comps = &sbi->one_comp; | ||
789 | sbi->comps.ods = sbi->_min_one_dev; | ||
790 | |||
717 | /* fill in some other data by hand */ | 791 | /* fill in some other data by hand */ |
718 | memset(sb->s_id, 0, sizeof(sb->s_id)); | 792 | memset(sb->s_id, 0, sizeof(sb->s_id)); |
719 | strcpy(sb->s_id, "exofs"); | 793 | strcpy(sb->s_id, "exofs"); |
@@ -724,11 +798,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
724 | sb->s_bdev = NULL; | 798 | sb->s_bdev = NULL; |
725 | sb->s_dev = 0; | 799 | sb->s_dev = 0; |
726 | 800 | ||
727 | obj.partition = sbi->layout.s_pid; | 801 | comp.obj.partition = sbi->one_comp.obj.partition; |
728 | obj.id = EXOFS_SUPER_ID; | 802 | comp.obj.id = EXOFS_SUPER_ID; |
729 | exofs_make_credential(sbi->s_cred, &obj); | 803 | exofs_make_credential(comp.cred, &comp.obj); |
730 | 804 | ||
731 | ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); | 805 | ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); |
732 | if (unlikely(ret)) | 806 | if (unlikely(ret)) |
733 | goto free_sbi; | 807 | goto free_sbi; |
734 | 808 | ||
@@ -757,9 +831,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
757 | 831 | ||
758 | table_count = le64_to_cpu(fscb.s_dev_table_count); | 832 | table_count = le64_to_cpu(fscb.s_dev_table_count); |
759 | if (table_count) { | 833 | if (table_count) { |
760 | ret = exofs_read_lookup_dev_table(&sbi, table_count); | 834 | ret = exofs_read_lookup_dev_table(sbi, od, table_count); |
761 | if (unlikely(ret)) | 835 | if (unlikely(ret)) |
762 | goto free_sbi; | 836 | goto free_sbi; |
837 | } else { | ||
838 | sbi->comps.ods[0] = od; | ||
763 | } | 839 | } |
764 | 840 | ||
765 | __sbi_read_stats(sbi); | 841 | __sbi_read_stats(sbi); |
@@ -793,20 +869,20 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) | |||
793 | goto free_sbi; | 869 | goto free_sbi; |
794 | } | 870 | } |
795 | 871 | ||
796 | _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], | 872 | ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); |
797 | sbi->layout.s_pid); | 873 | if (ret) { |
798 | if (opts->is_osdname) | 874 | EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); |
799 | kfree(opts->dev_name); | 875 | goto free_sbi; |
876 | } | ||
877 | |||
878 | _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0], | ||
879 | sbi->one_comp.obj.partition); | ||
800 | return 0; | 880 | return 0; |
801 | 881 | ||
802 | free_sbi: | 882 | free_sbi: |
803 | bdi_destroy(&sbi->bdi); | ||
804 | free_bdi: | ||
805 | EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", | 883 | EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", |
806 | opts->dev_name, sbi->layout.s_pid, ret); | 884 | opts->dev_name, sbi->one_comp.obj.partition, ret); |
807 | exofs_free_sbi(sbi); | 885 | exofs_free_sbi(sbi); |
808 | if (opts->is_osdname) | ||
809 | kfree(opts->dev_name); | ||
810 | return ret; | 886 | return ret; |
811 | } | 887 | } |
812 | 888 | ||
@@ -837,7 +913,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
837 | { | 913 | { |
838 | struct super_block *sb = dentry->d_sb; | 914 | struct super_block *sb = dentry->d_sb; |
839 | struct exofs_sb_info *sbi = sb->s_fs_info; | 915 | struct exofs_sb_info *sbi = sb->s_fs_info; |
840 | struct exofs_io_state *ios; | 916 | struct ore_io_state *ios; |
841 | struct osd_attr attrs[] = { | 917 | struct osd_attr attrs[] = { |
842 | ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, | 918 | ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, |
843 | OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), | 919 | OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), |
@@ -846,21 +922,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
846 | }; | 922 | }; |
847 | uint64_t capacity = ULLONG_MAX; | 923 | uint64_t capacity = ULLONG_MAX; |
848 | uint64_t used = ULLONG_MAX; | 924 | uint64_t used = ULLONG_MAX; |
849 | uint8_t cred_a[OSD_CAP_LEN]; | ||
850 | int ret; | 925 | int ret; |
851 | 926 | ||
852 | ret = exofs_get_io_state(&sbi->layout, &ios); | 927 | ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios); |
853 | if (ret) { | 928 | if (ret) { |
854 | EXOFS_DBGMSG("exofs_get_io_state failed.\n"); | 929 | EXOFS_DBGMSG("ore_get_io_state failed.\n"); |
855 | return ret; | 930 | return ret; |
856 | } | 931 | } |
857 | 932 | ||
858 | exofs_make_credential(cred_a, &ios->obj); | ||
859 | ios->cred = sbi->s_cred; | ||
860 | ios->in_attr = attrs; | 933 | ios->in_attr = attrs; |
861 | ios->in_attr_len = ARRAY_SIZE(attrs); | 934 | ios->in_attr_len = ARRAY_SIZE(attrs); |
862 | 935 | ||
863 | ret = exofs_sbi_read(ios); | 936 | ret = ore_read(ios); |
864 | if (unlikely(ret)) | 937 | if (unlikely(ret)) |
865 | goto out; | 938 | goto out; |
866 | 939 | ||
@@ -889,7 +962,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
889 | buf->f_namelen = EXOFS_NAME_LEN; | 962 | buf->f_namelen = EXOFS_NAME_LEN; |
890 | 963 | ||
891 | out: | 964 | out: |
892 | exofs_put_io_state(ios); | 965 | ore_put_io_state(ios); |
893 | return ret; | 966 | return ret; |
894 | } | 967 | } |
895 | 968 | ||
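The hunks above replace exofs' fixed per-sbi device list with the ORE comps table, and the device-table reader now mirrors the first numdevs-1 entries at the tail of a twice-bigger ods[] array. Below is a minimal userspace sketch of that round-robin trick, with mock device-name strings standing in for struct osd_dev pointers (not kernel code):

```c
#include <stdio.h>

#define NUMDEVS 4

int main(void)
{
	/* stand-ins for struct osd_dev pointers */
	const char *dev[NUMDEVS] = { "osd0", "osd1", "osd2", "osd3" };
	const char *ods[2 * NUMDEVS - 1];
	unsigned i, ino;

	for (i = 0; i < NUMDEVS; i++)
		ods[i] = dev[i];
	/* the copy loop from exofs_read_lookup_dev_table() above */
	for (i = 0; i < NUMDEVS - 1; i++)
		ods[i + NUMDEVS] = ods[i];

	for (ino = 10; ino < 13; ino++) {
		unsigned first = ino % NUMDEVS;

		printf("inode %u view:", ino);
		/* a full walk of all devices never needs to wrap */
		for (i = 0; i < NUMDEVS; i++)
			printf(" %s", ods[first + i]);
		printf("\n");
	}
	return 0;
}
```

Because ods[i + numdevs] aliases ods[i], a component walk of length numdevs starting at (ino % numdevs) never has to take a modulo mid-stripe, which is what the "sequential view" comment in the hunk refers to.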
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 52c053763942..35d6a3cfd9ff 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c | |||
@@ -194,12 +194,10 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
194 | case ACL_TYPE_ACCESS: | 194 | case ACL_TYPE_ACCESS: |
195 | name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; | 195 | name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; |
196 | if (acl) { | 196 | if (acl) { |
197 | mode_t mode = inode->i_mode; | 197 | error = posix_acl_equiv_mode(acl, &inode->i_mode); |
198 | error = posix_acl_equiv_mode(acl, &mode); | ||
199 | if (error < 0) | 198 | if (error < 0) |
200 | return error; | 199 | return error; |
201 | else { | 200 | else { |
202 | inode->i_mode = mode; | ||
203 | inode->i_ctime = CURRENT_TIME_SEC; | 201 | inode->i_ctime = CURRENT_TIME_SEC; |
204 | mark_inode_dirty(inode); | 202 | mark_inode_dirty(inode); |
205 | if (error == 0) | 203 | if (error == 0) |
@@ -253,16 +251,14 @@ ext2_init_acl(struct inode *inode, struct inode *dir) | |||
253 | inode->i_mode &= ~current_umask(); | 251 | inode->i_mode &= ~current_umask(); |
254 | } | 252 | } |
255 | if (test_opt(inode->i_sb, POSIX_ACL) && acl) { | 253 | if (test_opt(inode->i_sb, POSIX_ACL) && acl) { |
256 | mode_t mode = inode->i_mode; | ||
257 | if (S_ISDIR(inode->i_mode)) { | 254 | if (S_ISDIR(inode->i_mode)) { |
258 | error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); | 255 | error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); |
259 | if (error) | 256 | if (error) |
260 | goto cleanup; | 257 | goto cleanup; |
261 | } | 258 | } |
262 | error = posix_acl_create(&acl, GFP_KERNEL, &mode); | 259 | error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); |
263 | if (error < 0) | 260 | if (error < 0) |
264 | return error; | 261 | return error; |
265 | inode->i_mode = mode; | ||
266 | if (error > 0) { | 262 | if (error > 0) { |
267 | /* This is an extended ACL */ | 263 | /* This is an extended ACL */ |
268 | error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); | 264 | error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); |
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 6c29bf0df04a..3091f62e55b6 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c | |||
@@ -199,12 +199,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, | |||
199 | case ACL_TYPE_ACCESS: | 199 | case ACL_TYPE_ACCESS: |
200 | name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; | 200 | name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; |
201 | if (acl) { | 201 | if (acl) { |
202 | mode_t mode = inode->i_mode; | 202 | error = posix_acl_equiv_mode(acl, &inode->i_mode); |
203 | error = posix_acl_equiv_mode(acl, &mode); | ||
204 | if (error < 0) | 203 | if (error < 0) |
205 | return error; | 204 | return error; |
206 | else { | 205 | else { |
207 | inode->i_mode = mode; | ||
208 | inode->i_ctime = CURRENT_TIME_SEC; | 206 | inode->i_ctime = CURRENT_TIME_SEC; |
209 | ext3_mark_inode_dirty(handle, inode); | 207 | ext3_mark_inode_dirty(handle, inode); |
210 | if (error == 0) | 208 | if (error == 0) |
@@ -261,19 +259,16 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) | |||
261 | inode->i_mode &= ~current_umask(); | 259 | inode->i_mode &= ~current_umask(); |
262 | } | 260 | } |
263 | if (test_opt(inode->i_sb, POSIX_ACL) && acl) { | 261 | if (test_opt(inode->i_sb, POSIX_ACL) && acl) { |
264 | mode_t mode = inode->i_mode; | ||
265 | |||
266 | if (S_ISDIR(inode->i_mode)) { | 262 | if (S_ISDIR(inode->i_mode)) { |
267 | error = ext3_set_acl(handle, inode, | 263 | error = ext3_set_acl(handle, inode, |
268 | ACL_TYPE_DEFAULT, acl); | 264 | ACL_TYPE_DEFAULT, acl); |
269 | if (error) | 265 | if (error) |
270 | goto cleanup; | 266 | goto cleanup; |
271 | } | 267 | } |
272 | error = posix_acl_create(&acl, GFP_NOFS, &mode); | 268 | error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); |
273 | if (error < 0) | 269 | if (error < 0) |
274 | return error; | 270 | return error; |
275 | 271 | ||
276 | inode->i_mode = mode; | ||
277 | if (error > 0) { | 272 | if (error > 0) { |
278 | /* This is an extended ACL */ | 273 | /* This is an extended ACL */ |
279 | error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); | 274 | error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); |
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 04109460ba9e..56fd8f865930 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile | |||
@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o | |||
7 | ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ | 7 | ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ |
8 | ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ | 8 | ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ |
9 | ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ | 9 | ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ |
10 | mmp.o | 10 | mmp.o indirect.o |
11 | 11 | ||
12 | ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o | 12 | ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o |
13 | ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o | 13 | ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o |
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dca2d1ded931..a5c29bb3b835 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c | |||
@@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, | |||
198 | case ACL_TYPE_ACCESS: | 198 | case ACL_TYPE_ACCESS: |
199 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; | 199 | name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; |
200 | if (acl) { | 200 | if (acl) { |
201 | mode_t mode = inode->i_mode; | 201 | error = posix_acl_equiv_mode(acl, &inode->i_mode); |
202 | error = posix_acl_equiv_mode(acl, &mode); | ||
203 | if (error < 0) | 202 | if (error < 0) |
204 | return error; | 203 | return error; |
205 | else { | 204 | else { |
206 | inode->i_mode = mode; | ||
207 | inode->i_ctime = ext4_current_time(inode); | 205 | inode->i_ctime = ext4_current_time(inode); |
208 | ext4_mark_inode_dirty(handle, inode); | 206 | ext4_mark_inode_dirty(handle, inode); |
209 | if (error == 0) | 207 | if (error == 0) |
@@ -259,19 +257,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) | |||
259 | inode->i_mode &= ~current_umask(); | 257 | inode->i_mode &= ~current_umask(); |
260 | } | 258 | } |
261 | if (test_opt(inode->i_sb, POSIX_ACL) && acl) { | 259 | if (test_opt(inode->i_sb, POSIX_ACL) && acl) { |
262 | mode_t mode = inode->i_mode; | ||
263 | |||
264 | if (S_ISDIR(inode->i_mode)) { | 260 | if (S_ISDIR(inode->i_mode)) { |
265 | error = ext4_set_acl(handle, inode, | 261 | error = ext4_set_acl(handle, inode, |
266 | ACL_TYPE_DEFAULT, acl); | 262 | ACL_TYPE_DEFAULT, acl); |
267 | if (error) | 263 | if (error) |
268 | goto cleanup; | 264 | goto cleanup; |
269 | } | 265 | } |
270 | error = posix_acl_create(&acl, GFP_NOFS, &mode); | 266 | error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); |
271 | if (error < 0) | 267 | if (error < 0) |
272 | return error; | 268 | return error; |
273 | 269 | ||
274 | inode->i_mode = mode; | ||
275 | if (error > 0) { | 270 | if (error > 0) { |
276 | /* This is an extended ACL */ | 271 | /* This is an extended ACL */ |
277 | error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); | 272 | error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 264f6949511e..f8224adf496e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) | |||
620 | 620 | ||
621 | } | 621 | } |
622 | 622 | ||
623 | /** | ||
624 | * ext4_inode_to_goal_block - return a hint for block allocation | ||
625 | * @inode: inode for block allocation | ||
626 | * | ||
627 | * Return the ideal location to start allocating blocks for a | ||
628 | * newly created inode. | ||
629 | */ | ||
630 | ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) | ||
631 | { | ||
632 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
633 | ext4_group_t block_group; | ||
634 | ext4_grpblk_t colour; | ||
635 | int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); | ||
636 | ext4_fsblk_t bg_start; | ||
637 | ext4_fsblk_t last_block; | ||
638 | |||
639 | block_group = ei->i_block_group; | ||
640 | if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { | ||
641 | /* | ||
642 | * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME | ||
643 | * block groups per flexgroup, reserve the first block | ||
644 | * group for directories and special files. Regular | ||
645 | * files will start at the second block group. This | ||
646 | * tends to speed up directory access and improve | ||
647 | * fsck times. | ||
648 | */ | ||
649 | block_group &= ~(flex_size-1); | ||
650 | if (S_ISREG(inode->i_mode)) | ||
651 | block_group++; | ||
652 | } | ||
653 | bg_start = ext4_group_first_block_no(inode->i_sb, block_group); | ||
654 | last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; | ||
655 | |||
656 | /* | ||
657 | * If we are doing delayed allocation, we don't need to take | ||
658 | * colour into account. | ||
659 | */ | ||
660 | if (test_opt(inode->i_sb, DELALLOC)) | ||
661 | return bg_start; | ||
662 | |||
663 | if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) | ||
664 | colour = (current->pid % 16) * | ||
665 | (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); | ||
666 | else | ||
667 | colour = (current->pid % 16) * ((last_block - bg_start) / 16); | ||
668 | return bg_start + colour; | ||
669 | } | ||
670 | |||
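ext4_inode_to_goal_block() consolidates the goal heuristic that was previously duplicated in extents.c (see its removal further down). The PID-derived "colour" spreads up to 16 concurrent allocators across distinct 1/16-group offsets. A sketch of just that arithmetic, assuming 4K blocks (32768 blocks per group) and an invented bg_start value:

```c
#include <stdio.h>

int main(void)
{
	unsigned long blocks_per_group = 32768;	/* 4K block size */
	unsigned long bg_start = 163840;	/* first block of group 5 */
	int pid;

	for (pid = 100; pid < 104; pid++) {
		/* each of the 16 PID buckets is 2048 blocks wide here */
		unsigned long colour = (pid % 16) *
				       (blocks_per_group / 16);
		printf("pid %d -> goal %lu\n", pid, bg_start + colour);
	}
	return 0;
}
```

Delayed allocation skips the colour entirely, as the function's DELALLOC early return shows, because the goal is recomputed at writeback time anyway.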
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index fac90f3fba80..8efb2f0a3447 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c | |||
@@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, | |||
246 | return 1; | 246 | return 1; |
247 | } | 247 | } |
248 | 248 | ||
249 | int ext4_check_blockref(const char *function, unsigned int line, | ||
250 | struct inode *inode, __le32 *p, unsigned int max) | ||
251 | { | ||
252 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
253 | __le32 *bref = p; | ||
254 | unsigned int blk; | ||
255 | |||
256 | while (bref < p+max) { | ||
257 | blk = le32_to_cpu(*bref++); | ||
258 | if (blk && | ||
259 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
260 | blk, 1))) { | ||
261 | es->s_last_error_block = cpu_to_le64(blk); | ||
262 | ext4_error_inode(inode, function, line, blk, | ||
263 | "invalid block"); | ||
264 | return -EIO; | ||
265 | } | ||
266 | } | ||
267 | return 0; | ||
268 | } | ||
269 | |||
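ext4_check_blockref(), moved here so the new indirect.c below can share it, rejects any nonzero block pointer that fails ext4_data_block_valid(). A simplified userspace sketch with a plain range check standing in for the real validity test; the bounds and return value here are illustrative only:

```c
#include <stdio.h>
#include <stdint.h>

static int block_valid(uint32_t blk, uint32_t first, uint32_t last)
{
	/* the real check also excludes system-zone overlaps */
	return blk >= first && blk <= last;
}

static int check_blockref(const uint32_t *p, unsigned max,
			  uint32_t first, uint32_t last)
{
	const uint32_t *bref;

	for (bref = p; bref < p + max; bref++) {
		uint32_t blk = *bref;

		/* 0 means "hole", which is always acceptable */
		if (blk && !block_valid(blk, first, last))
			return -5;	/* -EIO */
	}
	return 0;
}

int main(void)
{
	uint32_t refs[4] = { 0, 1000, 999999, 42 };

	/* prints -5: 999999 is outside [100, 10000] */
	printf("%d\n", check_blockref(refs, 4, 100, 10000));
	return 0;
}
```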
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fa44df879711..e717dfd2f2b4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -526,6 +526,7 @@ struct ext4_new_group_data { | |||
526 | #define EXT4_FREE_BLOCKS_METADATA 0x0001 | 526 | #define EXT4_FREE_BLOCKS_METADATA 0x0001 |
527 | #define EXT4_FREE_BLOCKS_FORGET 0x0002 | 527 | #define EXT4_FREE_BLOCKS_FORGET 0x0002 |
528 | #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 | 528 | #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 |
529 | #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 | ||
529 | 530 | ||
530 | /* | 531 | /* |
531 | * ioctl commands | 532 | * ioctl commands |
@@ -939,6 +940,8 @@ struct ext4_inode_info { | |||
939 | #define ext4_find_next_zero_bit find_next_zero_bit_le | 940 | #define ext4_find_next_zero_bit find_next_zero_bit_le |
940 | #define ext4_find_next_bit find_next_bit_le | 941 | #define ext4_find_next_bit find_next_bit_le |
941 | 942 | ||
943 | extern void ext4_set_bits(void *bm, int cur, int len); | ||
944 | |||
942 | /* | 945 | /* |
943 | * Maximal mount counts between two filesystem checks | 946 | * Maximal mount counts between two filesystem checks |
944 | */ | 947 | */ |
@@ -1126,7 +1129,8 @@ struct ext4_sb_info { | |||
1126 | struct journal_s *s_journal; | 1129 | struct journal_s *s_journal; |
1127 | struct list_head s_orphan; | 1130 | struct list_head s_orphan; |
1128 | struct mutex s_orphan_lock; | 1131 | struct mutex s_orphan_lock; |
1129 | struct mutex s_resize_lock; | 1132 | unsigned long s_resize_flags; /* Flags indicating if there |
1133 | is a resizer */ | ||
1130 | unsigned long s_commit_interval; | 1134 | unsigned long s_commit_interval; |
1131 | u32 s_max_batch_time; | 1135 | u32 s_max_batch_time; |
1132 | u32 s_min_batch_time; | 1136 | u32 s_min_batch_time; |
@@ -1214,6 +1218,9 @@ struct ext4_sb_info { | |||
1214 | 1218 | ||
1215 | /* Kernel thread for multiple mount protection */ | 1219 | /* Kernel thread for multiple mount protection */ |
1216 | struct task_struct *s_mmp_tsk; | 1220 | struct task_struct *s_mmp_tsk; |
1221 | |||
1222 | /* record the last minlen when FITRIM is called. */ | ||
1223 | atomic_t s_last_trim_minblks; | ||
1217 | }; | 1224 | }; |
1218 | 1225 | ||
1219 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) | 1226 | static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) |
@@ -1743,6 +1750,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, | |||
1743 | struct ext4_group_desc *desc); | 1750 | struct ext4_group_desc *desc); |
1744 | #define ext4_free_blocks_after_init(sb, group, desc) \ | 1751 | #define ext4_free_blocks_after_init(sb, group, desc) \ |
1745 | ext4_init_block_bitmap(sb, NULL, group, desc) | 1752 | ext4_init_block_bitmap(sb, NULL, group, desc) |
1753 | ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); | ||
1746 | 1754 | ||
1747 | /* dir.c */ | 1755 | /* dir.c */ |
1748 | extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, | 1756 | extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, |
@@ -1793,7 +1801,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode, | |||
1793 | unsigned long count, int flags); | 1801 | unsigned long count, int flags); |
1794 | extern int ext4_mb_add_groupinfo(struct super_block *sb, | 1802 | extern int ext4_mb_add_groupinfo(struct super_block *sb, |
1795 | ext4_group_t i, struct ext4_group_desc *desc); | 1803 | ext4_group_t i, struct ext4_group_desc *desc); |
1796 | extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | 1804 | extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, |
1797 | ext4_fsblk_t block, unsigned long count); | 1805 | ext4_fsblk_t block, unsigned long count); |
1798 | extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); | 1806 | extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); |
1799 | 1807 | ||
@@ -1834,6 +1842,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | |||
1834 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); | 1842 | extern qsize_t *ext4_get_reserved_space(struct inode *inode); |
1835 | extern void ext4_da_update_reserve_space(struct inode *inode, | 1843 | extern void ext4_da_update_reserve_space(struct inode *inode, |
1836 | int used, int quota_claim); | 1844 | int used, int quota_claim); |
1845 | |||
1846 | /* indirect.c */ | ||
1847 | extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | ||
1848 | struct ext4_map_blocks *map, int flags); | ||
1849 | extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
1850 | const struct iovec *iov, loff_t offset, | ||
1851 | unsigned long nr_segs); | ||
1852 | extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); | ||
1853 | extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk); | ||
1854 | extern void ext4_ind_truncate(struct inode *inode); | ||
1855 | |||
1837 | /* ioctl.c */ | 1856 | /* ioctl.c */ |
1838 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); | 1857 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); |
1839 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); | 1858 | extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); |
@@ -1855,6 +1874,9 @@ extern int ext4_group_extend(struct super_block *sb, | |||
1855 | ext4_fsblk_t n_blocks_count); | 1874 | ext4_fsblk_t n_blocks_count); |
1856 | 1875 | ||
1857 | /* super.c */ | 1876 | /* super.c */ |
1877 | extern void *ext4_kvmalloc(size_t size, gfp_t flags); | ||
1878 | extern void *ext4_kvzalloc(size_t size, gfp_t flags); | ||
1879 | extern void ext4_kvfree(void *ptr); | ||
1858 | extern void __ext4_error(struct super_block *, const char *, unsigned int, | 1880 | extern void __ext4_error(struct super_block *, const char *, unsigned int, |
1859 | const char *, ...) | 1881 | const char *, ...) |
1860 | __attribute__ ((format (printf, 4, 5))); | 1882 | __attribute__ ((format (printf, 4, 5))); |
@@ -2067,11 +2089,19 @@ struct ext4_group_info { | |||
2067 | * 5 free 8-block regions. */ | 2089 | * 5 free 8-block regions. */ |
2068 | }; | 2090 | }; |
2069 | 2091 | ||
2070 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | 2092 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 |
2093 | #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 | ||
2071 | 2094 | ||
2072 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | 2095 | #define EXT4_MB_GRP_NEED_INIT(grp) \ |
2073 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | 2096 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) |
2074 | 2097 | ||
2098 | #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ | ||
2099 | (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) | ||
2100 | #define EXT4_MB_GRP_SET_TRIMMED(grp) \ | ||
2101 | (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) | ||
2102 | #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ | ||
2103 | (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) | ||
2104 | |||
2075 | #define EXT4_MAX_CONTENTION 8 | 2105 | #define EXT4_MAX_CONTENTION 8 |
2076 | #define EXT4_CONTENTION_THRESHOLD 2 | 2106 | #define EXT4_CONTENTION_THRESHOLD 2 |
2077 | 2107 | ||
@@ -2123,6 +2153,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb) | |||
2123 | } | 2153 | } |
2124 | 2154 | ||
2125 | /* | 2155 | /* |
2156 | * Block validity checking | ||
2157 | */ | ||
2158 | #define ext4_check_indirect_blockref(inode, bh) \ | ||
2159 | ext4_check_blockref(__func__, __LINE__, inode, \ | ||
2160 | (__le32 *)(bh)->b_data, \ | ||
2161 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) | ||
2162 | |||
2163 | #define ext4_ind_check_inode(inode) \ | ||
2164 | ext4_check_blockref(__func__, __LINE__, inode, \ | ||
2165 | EXT4_I(inode)->i_data, \ | ||
2166 | EXT4_NDIR_BLOCKS) | ||
2167 | |||
2168 | /* | ||
2126 | * Inodes and files operations | 2169 | * Inodes and files operations |
2127 | */ | 2170 | */ |
2128 | 2171 | ||
@@ -2151,6 +2194,8 @@ extern void ext4_exit_system_zone(void); | |||
2151 | extern int ext4_data_block_valid(struct ext4_sb_info *sbi, | 2194 | extern int ext4_data_block_valid(struct ext4_sb_info *sbi, |
2152 | ext4_fsblk_t start_blk, | 2195 | ext4_fsblk_t start_blk, |
2153 | unsigned int count); | 2196 | unsigned int count); |
2197 | extern int ext4_check_blockref(const char *, unsigned int, | ||
2198 | struct inode *, __le32 *, unsigned int); | ||
2154 | 2199 | ||
2155 | /* extents.c */ | 2200 | /* extents.c */ |
2156 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); | 2201 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); |
@@ -2230,6 +2275,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) | |||
2230 | extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; | 2275 | extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; |
2231 | extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; | 2276 | extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; |
2232 | 2277 | ||
2278 | #define EXT4_RESIZING 0 | ||
2279 | extern int ext4_resize_begin(struct super_block *sb); | ||
2280 | extern void ext4_resize_end(struct super_block *sb); | ||
2281 | |||
2233 | #endif /* __KERNEL__ */ | 2282 | #endif /* __KERNEL__ */ |
2234 | 2283 | ||
2235 | #endif /* _EXT4_H */ | 2284 | #endif /* _EXT4_H */ |
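The new EXT4_GROUP_INFO_WAS_TRIMMED_BIT and the s_last_trim_minblks counter above work together to make repeated FITRIM calls cheap: a group is re-trimmed only if it was never trimmed, or if the caller now asks for a smaller minimum free extent than the cached one. A hedged sketch of that decision; the real logic lives in fs/ext4/mballoc.c, and this is not the kernel source:

```c
#include <stdio.h>

#define WAS_TRIMMED (1UL << 1)	/* mirrors EXT4_GROUP_INFO_WAS_TRIMMED_BIT */

/* Trim the group only if it was never trimmed, or if the caller now
 * asks for a finer minimum extent than the last recorded minlen.
 */
static int should_trim(unsigned long bb_state,
		       unsigned long minblks, unsigned long last_minblks)
{
	return !(bb_state & WAS_TRIMMED) || minblks < last_minblks;
}

int main(void)
{
	printf("%d\n", should_trim(0, 256, 0));		    /* 1: never trimmed */
	printf("%d\n", should_trim(WAS_TRIMMED, 256, 128)); /* 0: coarser request */
	printf("%d\n", should_trim(WAS_TRIMMED, 64, 128));  /* 1: finer request */
	return 0;
}
```

Allocations clear the bit again (EXT4_MB_GRP_CLEAR_TRIMMED), since freeing and reusing blocks invalidates the cached trim state.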
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f815cc81e7a2..57cf568a98ab 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, | |||
114 | struct ext4_ext_path *path, | 114 | struct ext4_ext_path *path, |
115 | ext4_lblk_t block) | 115 | ext4_lblk_t block) |
116 | { | 116 | { |
117 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
118 | ext4_fsblk_t bg_start; | ||
119 | ext4_fsblk_t last_block; | ||
120 | ext4_grpblk_t colour; | ||
121 | ext4_group_t block_group; | ||
122 | int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); | ||
123 | int depth; | 117 | int depth; |
124 | 118 | ||
125 | if (path) { | 119 | if (path) { |
@@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, | |||
161 | } | 155 | } |
162 | 156 | ||
163 | /* OK. use inode's group */ | 157 | /* OK. use inode's group */ |
164 | block_group = ei->i_block_group; | 158 | return ext4_inode_to_goal_block(inode); |
165 | if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { | ||
166 | /* | ||
167 | * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME | ||
168 | * block groups per flexgroup, reserve the first block | ||
169 | * group for directories and special files. Regular | ||
170 | * files will start at the second block group. This | ||
171 | * tends to speed up directory access and improves | ||
172 | * fsck times. | ||
173 | */ | ||
174 | block_group &= ~(flex_size-1); | ||
175 | if (S_ISREG(inode->i_mode)) | ||
176 | block_group++; | ||
177 | } | ||
178 | bg_start = ext4_group_first_block_no(inode->i_sb, block_group); | ||
179 | last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; | ||
180 | |||
181 | /* | ||
182 | * If we are doing delayed allocation, we don't need take | ||
183 | * colour into account. | ||
184 | */ | ||
185 | if (test_opt(inode->i_sb, DELALLOC)) | ||
186 | return bg_start; | ||
187 | |||
188 | if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) | ||
189 | colour = (current->pid % 16) * | ||
190 | (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); | ||
191 | else | ||
192 | colour = (current->pid % 16) * ((last_block - bg_start) / 16); | ||
193 | return bg_start + colour + block; | ||
194 | } | 159 | } |
195 | 160 | ||
196 | /* | 161 | /* |
@@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, | |||
776 | logical, le32_to_cpu(curp->p_idx->ei_block)); | 741 | logical, le32_to_cpu(curp->p_idx->ei_block)); |
777 | return -EIO; | 742 | return -EIO; |
778 | } | 743 | } |
744 | |||
745 | if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) | ||
746 | >= le16_to_cpu(curp->p_hdr->eh_max))) { | ||
747 | EXT4_ERROR_INODE(inode, | ||
748 | "eh_entries %d >= eh_max %d!", | ||
749 | le16_to_cpu(curp->p_hdr->eh_entries), | ||
750 | le16_to_cpu(curp->p_hdr->eh_max)); | ||
751 | return -EIO; | ||
752 | } | ||
753 | |||
779 | len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; | 754 | len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; |
780 | if (logical > le32_to_cpu(curp->p_idx->ei_block)) { | 755 | if (logical > le32_to_cpu(curp->p_idx->ei_block)) { |
781 | /* insert after */ | 756 | /* insert after */ |
@@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, | |||
805 | ext4_idx_store_pblock(ix, ptr); | 780 | ext4_idx_store_pblock(ix, ptr); |
806 | le16_add_cpu(&curp->p_hdr->eh_entries, 1); | 781 | le16_add_cpu(&curp->p_hdr->eh_entries, 1); |
807 | 782 | ||
808 | if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries) | ||
809 | > le16_to_cpu(curp->p_hdr->eh_max))) { | ||
810 | EXT4_ERROR_INODE(inode, | ||
811 | "logical %d == ei_block %d!", | ||
812 | logical, le32_to_cpu(curp->p_idx->ei_block)); | ||
813 | return -EIO; | ||
814 | } | ||
815 | if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { | 783 | if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { |
816 | EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); | 784 | EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); |
817 | return -EIO; | 785 | return -EIO; |
@@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path) | |||
1446 | * ext4_ext_next_leaf_block: | 1414 | * ext4_ext_next_leaf_block: |
1447 | * returns first allocated block from next leaf or EXT_MAX_BLOCKS | 1415 | * returns first allocated block from next leaf or EXT_MAX_BLOCKS |
1448 | */ | 1416 | */ |
1449 | static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, | 1417 | static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path) |
1450 | struct ext4_ext_path *path) | ||
1451 | { | 1418 | { |
1452 | int depth; | 1419 | int depth; |
1453 | 1420 | ||
@@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | |||
1757 | goto merge; | 1724 | goto merge; |
1758 | } | 1725 | } |
1759 | 1726 | ||
1760 | repeat: | ||
1761 | depth = ext_depth(inode); | 1727 | depth = ext_depth(inode); |
1762 | eh = path[depth].p_hdr; | 1728 | eh = path[depth].p_hdr; |
1763 | if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) | 1729 | if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) |
@@ -1765,9 +1731,10 @@ repeat: | |||
1765 | 1731 | ||
1766 | /* probably next leaf has space for us? */ | 1732 | /* probably next leaf has space for us? */ |
1767 | fex = EXT_LAST_EXTENT(eh); | 1733 | fex = EXT_LAST_EXTENT(eh); |
1768 | next = ext4_ext_next_leaf_block(inode, path); | 1734 | next = EXT_MAX_BLOCKS; |
1769 | if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) | 1735 | if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) |
1770 | && next != EXT_MAX_BLOCKS) { | 1736 | next = ext4_ext_next_leaf_block(path); |
1737 | if (next != EXT_MAX_BLOCKS) { | ||
1771 | ext_debug("next leaf block - %d\n", next); | 1738 | ext_debug("next leaf block - %d\n", next); |
1772 | BUG_ON(npath != NULL); | 1739 | BUG_ON(npath != NULL); |
1773 | npath = ext4_ext_find_extent(inode, next, NULL); | 1740 | npath = ext4_ext_find_extent(inode, next, NULL); |
@@ -1779,7 +1746,7 @@ repeat: | |||
1779 | ext_debug("next leaf isn't full(%d)\n", | 1746 | ext_debug("next leaf isn't full(%d)\n", |
1780 | le16_to_cpu(eh->eh_entries)); | 1747 | le16_to_cpu(eh->eh_entries)); |
1781 | path = npath; | 1748 | path = npath; |
1782 | goto repeat; | 1749 | goto has_space; |
1783 | } | 1750 | } |
1784 | ext_debug("next leaf has no free space(%d,%d)\n", | 1751 | ext_debug("next leaf has no free space(%d,%d)\n", |
1785 | le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); | 1752 | le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); |
@@ -1839,7 +1806,7 @@ has_space: | |||
1839 | ext4_ext_pblock(newext), | 1806 | ext4_ext_pblock(newext), |
1840 | ext4_ext_is_uninitialized(newext), | 1807 | ext4_ext_is_uninitialized(newext), |
1841 | ext4_ext_get_actual_len(newext), | 1808 | ext4_ext_get_actual_len(newext), |
1842 | nearex, len, nearex + 1, nearex + 2); | 1809 | nearex, len, nearex, nearex + 1); |
1843 | memmove(nearex + 1, nearex, len); | 1810 | memmove(nearex + 1, nearex, len); |
1844 | path[depth].p_ext = nearex; | 1811 | path[depth].p_ext = nearex; |
1845 | } | 1812 | } |
@@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, | |||
2052 | } | 2019 | } |
2053 | 2020 | ||
2054 | /* | 2021 | /* |
2055 | * ext4_ext_in_cache() | 2022 | * ext4_ext_check_cache() |
2056 | * Checks to see if the given block is in the cache. | 2023 | * Checks to see if the given block is in the cache. |
2057 | * If it is, the cached extent is stored in the given | 2024 | * If it is, the cached extent is stored in the given |
2058 | * cache extent pointer. If the cached extent is a hole, | 2025 | * cache extent pointer. If the cached extent is a hole, |
@@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, | |||
2134 | /* | 2101 | /* |
2135 | * ext4_ext_rm_idx: | 2102 | * ext4_ext_rm_idx: |
2136 | * removes index from the index block. | 2103 | * removes index from the index block. |
2137 | * It's used in truncate case only, thus all requests are for | ||
2138 | * last index in the block only. | ||
2139 | */ | 2104 | */ |
2140 | static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | 2105 | static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, |
2141 | struct ext4_ext_path *path) | 2106 | struct ext4_ext_path *path) |
@@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | |||
2153 | err = ext4_ext_get_access(handle, inode, path); | 2118 | err = ext4_ext_get_access(handle, inode, path); |
2154 | if (err) | 2119 | if (err) |
2155 | return err; | 2120 | return err; |
2121 | |||
2122 | if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) { | ||
2123 | int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx; | ||
2124 | len *= sizeof(struct ext4_extent_idx); | ||
2125 | memmove(path->p_idx, path->p_idx + 1, len); | ||
2126 | } | ||
2127 | |||
2156 | le16_add_cpu(&path->p_hdr->eh_entries, -1); | 2128 | le16_add_cpu(&path->p_hdr->eh_entries, -1); |
2157 | err = ext4_ext_dirty(handle, inode, path); | 2129 | err = ext4_ext_dirty(handle, inode, path); |
2158 | if (err) | 2130 | if (err) |
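The memmove added in this hunk is what lets ext4_ext_rm_idx() remove an index that is not the last one in the block, lifting the truncate-only restriction noted in the deleted comment above. A toy illustration of the same compaction on a plain array:

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
	int idx[5] = { 10, 20, 30, 40, 50 };
	int entries = 5;
	int victim = 1;	/* remove the entry holding 20 */

	/* shift the tail left over the victim, as the hunk does for
	 * struct ext4_extent_idx entries
	 */
	if (victim != entries - 1)
		memmove(&idx[victim], &idx[victim + 1],
			(entries - 1 - victim) * sizeof(idx[0]));
	entries--;

	for (int i = 0; i < entries; i++)
		printf("%d ", idx[i]);
	printf("\n");	/* prints: 10 30 40 50 */
	return 0;
}
```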
@@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) | |||
2534 | return 1; | 2506 | return 1; |
2535 | } | 2507 | } |
2536 | 2508 | ||
2537 | static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, | 2509 | static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) |
2538 | ext4_lblk_t end) | ||
2539 | { | 2510 | { |
2540 | struct super_block *sb = inode->i_sb; | 2511 | struct super_block *sb = inode->i_sb; |
2541 | int depth = ext_depth(inode); | 2512 | int depth = ext_depth(inode); |
@@ -2575,7 +2546,7 @@ again: | |||
2575 | if (i == depth) { | 2546 | if (i == depth) { |
2576 | /* this is leaf block */ | 2547 | /* this is leaf block */ |
2577 | err = ext4_ext_rm_leaf(handle, inode, path, | 2548 | err = ext4_ext_rm_leaf(handle, inode, path, |
2578 | start, end); | 2549 | start, EXT_MAX_BLOCKS - 1); |
2579 | /* root level has p_bh == NULL, brelse() eats this */ | 2550 | /* root level has p_bh == NULL, brelse() eats this */ |
2580 | brelse(path[i].p_bh); | 2551 | brelse(path[i].p_bh); |
2581 | path[i].p_bh = NULL; | 2552 | path[i].p_bh = NULL; |
@@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, | |||
3107 | struct ext4_ext_path *path) | 3078 | struct ext4_ext_path *path) |
3108 | { | 3079 | { |
3109 | struct ext4_extent *ex; | 3080 | struct ext4_extent *ex; |
3110 | struct ext4_extent_header *eh; | ||
3111 | int depth; | 3081 | int depth; |
3112 | int err = 0; | 3082 | int err = 0; |
3113 | 3083 | ||
3114 | depth = ext_depth(inode); | 3084 | depth = ext_depth(inode); |
3115 | eh = path[depth].p_hdr; | ||
3116 | ex = path[depth].p_ext; | 3085 | ex = path[depth].p_ext; |
3117 | 3086 | ||
3118 | ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" | 3087 | ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" |
@@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
3357 | trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | 3326 | trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); |
3358 | 3327 | ||
3359 | /* check in cache */ | 3328 | /* check in cache */ |
3360 | if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && | 3329 | if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) && |
3361 | ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { | 3330 | ext4_ext_in_cache(inode, map->m_lblk, &newex)) { |
3362 | if (!newex.ee_start_lo && !newex.ee_start_hi) { | 3331 | if (!newex.ee_start_lo && !newex.ee_start_hi) { |
3363 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { | 3332 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { |
3364 | /* | 3333 | /* |
@@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
3497 | 3466 | ||
3498 | ext4_ext_mark_uninitialized(ex); | 3467 | ext4_ext_mark_uninitialized(ex); |
3499 | 3468 | ||
3500 | err = ext4_ext_remove_space(inode, map->m_lblk, | 3469 | ext4_ext_invalidate_cache(inode); |
3501 | map->m_lblk + punched_out); | 3470 | |
3471 | err = ext4_ext_rm_leaf(handle, inode, path, | ||
3472 | map->m_lblk, map->m_lblk + punched_out); | ||
3473 | |||
3474 | if (!err && path->p_hdr->eh_entries == 0) { | ||
3475 | /* | ||
3476 | * Punch hole freed all of this sub tree, | ||
3477 | * so we need to correct eh_depth | ||
3478 | */ | ||
3479 | err = ext4_ext_get_access(handle, inode, path); | ||
3480 | if (err == 0) { | ||
3481 | ext_inode_hdr(inode)->eh_depth = 0; | ||
3482 | ext_inode_hdr(inode)->eh_max = | ||
3483 | cpu_to_le16(ext4_ext_space_root( | ||
3484 | inode, 0)); | ||
3485 | |||
3486 | err = ext4_ext_dirty( | ||
3487 | handle, inode, path); | ||
3488 | } | ||
3489 | } | ||
3502 | 3490 | ||
3503 | goto out2; | 3491 | goto out2; |
3504 | } | 3492 | } |
@@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, | |||
3596 | } | 3584 | } |
3597 | 3585 | ||
3598 | err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); | 3586 | err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); |
3599 | if (err) | 3587 | if (!err) |
3600 | goto out2; | 3588 | err = ext4_ext_insert_extent(handle, inode, path, |
3601 | 3589 | &newex, flags); | |
3602 | err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); | ||
3603 | if (err) { | 3590 | if (err) { |
3591 | int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ? | ||
3592 | EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0; | ||
3604 | /* free data blocks we just allocated */ | 3593 | /* free data blocks we just allocated */ |
3605 | /* not a good idea to call discard here directly, | 3594 | /* not a good idea to call discard here directly, |
3606 | * but otherwise we'd need to call it every free() */ | 3595 | * but otherwise we'd need to call it every free() */ |
3607 | ext4_discard_preallocations(inode); | 3596 | ext4_discard_preallocations(inode); |
3608 | ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), | 3597 | ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), |
3609 | ext4_ext_get_actual_len(&newex), 0); | 3598 | ext4_ext_get_actual_len(&newex), fb_flags); |
3610 | goto out2; | 3599 | goto out2; |
3611 | } | 3600 | } |
3612 | 3601 | ||
@@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode) | |||
3699 | 3688 | ||
3700 | last_block = (inode->i_size + sb->s_blocksize - 1) | 3689 | last_block = (inode->i_size + sb->s_blocksize - 1) |
3701 | >> EXT4_BLOCK_SIZE_BITS(sb); | 3690 | >> EXT4_BLOCK_SIZE_BITS(sb); |
3702 | err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); | 3691 | err = ext4_ext_remove_space(inode, last_block); |
3703 | 3692 | ||
3704 | /* In a multi-transaction truncate, we only make the final | 3693 | /* In a multi-transaction truncate, we only make the final |
3705 | * transaction synchronous. | 3694 | * transaction synchronous. |
@@ -3835,7 +3824,7 @@ retry: | |||
3835 | blkbits) >> blkbits)) | 3824 | blkbits) >> blkbits)) |
3836 | new_size = offset + len; | 3825 | new_size = offset + len; |
3837 | else | 3826 | else |
3838 | new_size = (map.m_lblk + ret) << blkbits; | 3827 | new_size = ((loff_t) map.m_lblk + ret) << blkbits; |
3839 | 3828 | ||
3840 | ext4_falloc_update_inode(inode, mode, new_size, | 3829 | ext4_falloc_update_inode(inode, mode, new_size, |
3841 | (map.m_flags & EXT4_MAP_NEW)); | 3830 | (map.m_flags & EXT4_MAP_NEW)); |
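The (loff_t) cast in the last hunk fixes a 32-bit truncation: map.m_lblk is an ext4_lblk_t (32 bits), so shifting it by blkbits before widening wraps for fallocate ranges at or above 4GB. A standalone demonstration of the two expressions, with the types mocked up for userspace:

```c
#include <stdio.h>
#include <stdint.h>

typedef uint32_t ext4_lblk_t;	/* logical block numbers are 32-bit */

int main(void)
{
	ext4_lblk_t m_lblk = 0x100000;	/* logical block 1M: offset 4GB */
	int ret = 1;			/* number of blocks just mapped */
	int blkbits = 12;		/* 4K block size */

	/* old expression: the shift happens in 32 bits and wraps */
	long long wrong = (m_lblk + ret) << blkbits;
	/* fixed expression: widen to 64 bits before shifting */
	long long right = ((long long)m_lblk + ret) << blkbits;

	/* prints wrong: 4096, right: 4294971392 */
	printf("wrong: %lld\nright: %lld\n", wrong, right);
	return 0;
}
```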
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index da3bed3e0c29..036f78f7a1ef 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode) | |||
129 | { | 129 | { |
130 | struct writeback_control wbc; | 130 | struct writeback_control wbc; |
131 | struct dentry *dentry = NULL; | 131 | struct dentry *dentry = NULL; |
132 | struct inode *next; | ||
132 | int ret = 0; | 133 | int ret = 0; |
133 | 134 | ||
134 | while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { | 135 | if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) |
136 | return 0; | ||
137 | inode = igrab(inode); | ||
138 | while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { | ||
135 | ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); | 139 | ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); |
136 | dentry = list_entry(inode->i_dentry.next, | 140 | dentry = NULL; |
137 | struct dentry, d_alias); | 141 | spin_lock(&inode->i_lock); |
138 | if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) | 142 | if (!list_empty(&inode->i_dentry)) { |
143 | dentry = list_first_entry(&inode->i_dentry, | ||
144 | struct dentry, d_alias); | ||
145 | dget(dentry); | ||
146 | } | ||
147 | spin_unlock(&inode->i_lock); | ||
148 | if (!dentry) | ||
139 | break; | 149 | break; |
140 | inode = dentry->d_parent->d_inode; | 150 | next = igrab(dentry->d_parent->d_inode); |
151 | dput(dentry); | ||
152 | if (!next) | ||
153 | break; | ||
154 | iput(inode); | ||
155 | inode = next; | ||
141 | ret = sync_mapping_buffers(inode->i_mapping); | 156 | ret = sync_mapping_buffers(inode->i_mapping); |
142 | if (ret) | 157 | if (ret) |
143 | break; | 158 | break; |
@@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode) | |||
148 | if (ret) | 163 | if (ret) |
149 | break; | 164 | break; |
150 | } | 165 | } |
166 | iput(inode); | ||
151 | return ret; | 167 | return ret; |
152 | } | 168 | } |
153 | 169 | ||
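The reworked loop above makes ext4_sync_parent() hold real references while walking up the directory tree: igrab() the starting inode, dget() a dentry under i_lock, and pin the parent with igrab() before iput()-ing the child. A userspace mock of that "pin next before dropping current" pattern, with toy objects in place of inodes and dentries (not VFS code):

```c
#include <stdio.h>

struct obj {
	const char *name;
	int refcount;
	struct obj *parent;
};

static struct obj *grab(struct obj *o) { if (o) o->refcount++; return o; }
static void put(struct obj *o) { if (o) o->refcount--; }

int main(void)
{
	struct obj root = { "root", 1, NULL };
	struct obj dir  = { "dir",  1, &root };
	struct obj file = { "file", 1, &dir };
	struct obj *cur = grab(&file);

	while (cur->parent) {
		struct obj *next = grab(cur->parent);	/* pin parent first */
		put(cur);				/* then drop child */
		cur = next;
		printf("synced %s (ref %d)\n", cur->name, cur->refcount);
	}
	put(cur);
	return 0;
}
```

The old code followed i_dentry and d_parent pointers with no references at all, so a concurrent unlink could free an inode mid-walk; that is the race this hunk closes.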
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21bb2f61e502..9c63f273b550 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, | |||
1287 | group, used_blks, | 1287 | group, used_blks, |
1288 | ext4_itable_unused_count(sb, gdp)); | 1288 | ext4_itable_unused_count(sb, gdp)); |
1289 | ret = 1; | 1289 | ret = 1; |
1290 | goto out; | 1290 | goto err_out; |
1291 | } | 1291 | } |
1292 | 1292 | ||
1293 | blk = ext4_inode_table(sb, gdp) + used_blks; | 1293 | blk = ext4_inode_table(sb, gdp) + used_blks; |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c new file mode 100644 index 000000000000..b8602cde5b5a --- /dev/null +++ b/fs/ext4/indirect.c | |||
@@ -0,0 +1,1482 @@ | |||
1 | /* | ||
2 | * linux/fs/ext4/indirect.c | ||
3 | * | ||
4 | * from | ||
5 | * | ||
6 | * linux/fs/ext4/inode.c | ||
7 | * | ||
8 | * Copyright (C) 1992, 1993, 1994, 1995 | ||
9 | * Remy Card (card@masi.ibp.fr) | ||
10 | * Laboratoire MASI - Institut Blaise Pascal | ||
11 | * Universite Pierre et Marie Curie (Paris VI) | ||
12 | * | ||
13 | * from | ||
14 | * | ||
15 | * linux/fs/minix/inode.c | ||
16 | * | ||
17 | * Copyright (C) 1991, 1992 Linus Torvalds | ||
18 | * | ||
19 | * Goal-directed block allocation by Stephen Tweedie | ||
20 | * (sct@redhat.com), 1993, 1998 | ||
21 | */ | ||
22 | |||
23 | #include <linux/module.h> | ||
24 | #include "ext4_jbd2.h" | ||
25 | #include "truncate.h" | ||
26 | |||
27 | #include <trace/events/ext4.h> | ||
28 | |||
29 | typedef struct { | ||
30 | __le32 *p; | ||
31 | __le32 key; | ||
32 | struct buffer_head *bh; | ||
33 | } Indirect; | ||
34 | |||
35 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | ||
36 | { | ||
37 | p->key = *(p->p = v); | ||
38 | p->bh = bh; | ||
39 | } | ||
40 | |||
41 | /** | ||
42 | * ext4_block_to_path - parse the block number into array of offsets | ||
43 | * @inode: inode in question (we are only interested in its superblock) | ||
44 | * @i_block: block number to be parsed | ||
45 | * @offsets: array to store the offsets in | ||
46 | * @boundary: set this non-zero if the referred-to block is likely to be | ||
47 | * followed (on disk) by an indirect block. | ||
48 | * | ||
49 | * To store the locations of file's data ext4 uses a data structure common | ||
50 | * for UNIX filesystems - tree of pointers anchored in the inode, with | ||
51 | * data blocks at leaves and indirect blocks in intermediate nodes. | ||
52 | * This function translates the block number into a path in that tree - | ||
53 | * return value is the path length and @offsets[n] is the offset of | ||
54 | * pointer to (n+1)th node in the nth one. If @block is out of range | ||
55 | * (negative or too large) a warning is printed and zero is returned. | ||
56 | * | ||
57 | * Note: function doesn't find node addresses, so no IO is needed. All | ||
58 | * we need to know is the capacity of indirect blocks (taken from the | ||
59 | * inode->i_sb). | ||
60 | */ | ||
61 | |||
62 | /* | ||
63 | * Portability note: the last comparison (check that we fit into triple | ||
64 | * indirect block) is spelled differently, because otherwise on an | ||
65 | * architecture with 32-bit longs and 8Kb pages we might get into trouble | ||
66 | * if our filesystem had 8Kb blocks. We might use long long, but that would | ||
67 | * kill us on x86. Oh, well, at least the sign propagation does not matter - | ||
68 | * i_block would have to be negative in the very beginning, so we would not | ||
69 | * get there at all. | ||
70 | */ | ||
71 | |||
72 | static int ext4_block_to_path(struct inode *inode, | ||
73 | ext4_lblk_t i_block, | ||
74 | ext4_lblk_t offsets[4], int *boundary) | ||
75 | { | ||
76 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
77 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | ||
78 | const long direct_blocks = EXT4_NDIR_BLOCKS, | ||
79 | indirect_blocks = ptrs, | ||
80 | double_blocks = (1 << (ptrs_bits * 2)); | ||
81 | int n = 0; | ||
82 | int final = 0; | ||
83 | |||
84 | if (i_block < direct_blocks) { | ||
85 | offsets[n++] = i_block; | ||
86 | final = direct_blocks; | ||
87 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | ||
88 | offsets[n++] = EXT4_IND_BLOCK; | ||
89 | offsets[n++] = i_block; | ||
90 | final = ptrs; | ||
91 | } else if ((i_block -= indirect_blocks) < double_blocks) { | ||
92 | offsets[n++] = EXT4_DIND_BLOCK; | ||
93 | offsets[n++] = i_block >> ptrs_bits; | ||
94 | offsets[n++] = i_block & (ptrs - 1); | ||
95 | final = ptrs; | ||
96 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { | ||
97 | offsets[n++] = EXT4_TIND_BLOCK; | ||
98 | offsets[n++] = i_block >> (ptrs_bits * 2); | ||
99 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); | ||
100 | offsets[n++] = i_block & (ptrs - 1); | ||
101 | final = ptrs; | ||
102 | } else { | ||
103 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", | ||
104 | i_block + direct_blocks + | ||
105 | indirect_blocks + double_blocks, inode->i_ino); | ||
106 | } | ||
107 | if (boundary) | ||
108 | *boundary = final - 1 - (i_block & (ptrs - 1)); | ||
109 | return n; | ||
110 | } | ||
111 | |||
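A worked example of the translation above, assuming 4K blocks (EXT4_ADDR_PER_BLOCK == 1024, EXT4_NDIR_BLOCKS == 12); this is a userspace transcription for illustration, not the kernel function itself:

```c
#include <stdio.h>

#define NDIR 12
#define PTRS 1024
#define PTRS_BITS 10
#define IND_BLOCK 12
#define DIND_BLOCK 13
#define TIND_BLOCK 14

static int block_to_path(long i_block, long offsets[4])
{
	const long dbl = 1L << (PTRS_BITS * 2);
	int n = 0;

	if (i_block < NDIR) {
		offsets[n++] = i_block;
	} else if ((i_block -= NDIR) < PTRS) {
		offsets[n++] = IND_BLOCK;
		offsets[n++] = i_block;
	} else if ((i_block -= PTRS) < dbl) {
		offsets[n++] = DIND_BLOCK;
		offsets[n++] = i_block >> PTRS_BITS;
		offsets[n++] = i_block & (PTRS - 1);
	} else {
		i_block -= dbl;
		offsets[n++] = TIND_BLOCK;
		offsets[n++] = i_block >> (PTRS_BITS * 2);
		offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
		offsets[n++] = i_block & (PTRS - 1);
	}
	return n;
}

int main(void)
{
	/* expected: 5 -> [5]; 12 -> [12 0]; 1035 -> [12 1023];
	 * 1036 -> [13 0 0]; 1050637 -> [14 0 1 1]
	 */
	long blocks[] = { 5, 12, 1035, 1036, 1050637 };
	long off[4];
	int i, j, n;

	for (i = 0; i < 5; i++) {
		n = block_to_path(blocks[i], off);
		printf("block %7ld -> depth %d, offsets:", blocks[i], n);
		for (j = 0; j < n; j++)
			printf(" %ld", off[j]);
		printf("\n");
	}
	return 0;
}
```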
112 | /** | ||
113 | * ext4_get_branch - read the chain of indirect blocks leading to data | ||
114 | * @inode: inode in question | ||
115 | * @depth: depth of the chain (1 - direct pointer, etc.) | ||
116 | * @offsets: offsets of pointers in inode/indirect blocks | ||
117 | * @chain: place to store the result | ||
118 | * @err: here we store the error value | ||
119 | * | ||
120 | * Function fills the array of triples <key, p, bh> and returns %NULL | ||
121 | * if everything went OK or the pointer to the last filled triple | ||
122 | * (incomplete one) otherwise. Upon the return chain[i].key contains | ||
123 | * the number of (i+1)-th block in the chain (as it is stored in memory, | ||
124 | * i.e. little-endian 32-bit), chain[i].p contains the address of that | ||
125 | * number (it points into struct inode for i==0 and into the bh->b_data | ||
126 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect | ||
127 | * block for i>0 and NULL for i==0. In other words, it holds the block | ||
128 | * numbers of the chain, addresses they were taken from (and where we can | ||
129 | * verify that chain did not change) and buffer_heads hosting these | ||
130 | * numbers. | ||
131 | * | ||
132 | * Function stops when it stumbles upon zero pointer (absent block) | ||
133 | * (pointer to last triple returned, *@err == 0) | ||
134 | * or when it gets an IO error reading an indirect block | ||
135 | * (ditto, *@err == -EIO) | ||
136 | * or when it reads all @depth-1 indirect blocks successfully and finds | ||
137 | * the whole chain, all way to the data (returns %NULL, *err == 0). | ||
138 | * | ||
139 | * Need to be called with | ||
140 | * down_read(&EXT4_I(inode)->i_data_sem) | ||
141 | */ | ||
142 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | ||
143 | ext4_lblk_t *offsets, | ||
144 | Indirect chain[4], int *err) | ||
145 | { | ||
146 | struct super_block *sb = inode->i_sb; | ||
147 | Indirect *p = chain; | ||
148 | struct buffer_head *bh; | ||
149 | |||
150 | *err = 0; | ||
151 | /* i_data is not going away, no lock needed */ | ||
152 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); | ||
153 | if (!p->key) | ||
154 | goto no_block; | ||
155 | while (--depth) { | ||
156 | bh = sb_getblk(sb, le32_to_cpu(p->key)); | ||
157 | if (unlikely(!bh)) | ||
158 | goto failure; | ||
159 | |||
160 | if (!bh_uptodate_or_lock(bh)) { | ||
161 | if (bh_submit_read(bh) < 0) { | ||
162 | put_bh(bh); | ||
163 | goto failure; | ||
164 | } | ||
165 | /* validate block references */ | ||
166 | if (ext4_check_indirect_blockref(inode, bh)) { | ||
167 | put_bh(bh); | ||
168 | goto failure; | ||
169 | } | ||
170 | } | ||
171 | |||
172 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); | ||
173 | /* Reader: end */ | ||
174 | if (!p->key) | ||
175 | goto no_block; | ||
176 | } | ||
177 | return NULL; | ||
178 | |||
179 | failure: | ||
180 | *err = -EIO; | ||
181 | no_block: | ||
182 | return p; | ||
183 | } | ||
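
To make the triple bookkeeping above concrete, here is a toy user-space model (purely illustrative; the in-memory "disk" and read_block() stand in for sb_getblk()/bh_submit_read(), and the on-disk little-endian storage is ignored in favour of host order). Each level records both the block number and the address it was read from, which is what later lets callers re-verify that the chain did not change.

#include <stdint.h>
#include <stdio.h>

struct ind {
        uint32_t *p;    /* address the number was read from */
        uint32_t  key;  /* the block number itself */
};

/* Toy "disk": the inode's IND slot (12) points at block 100, an
 * indirect block whose slot 3 points at data block 555. */
static uint32_t inode_i_data[15] = { [12] = 100 };
static uint32_t block100[1024]   = { [3]  = 555 };

static uint32_t *read_block(uint32_t nr)
{
        return nr == 100 ? block100 : NULL;     /* stands in for sb_bread() */
}

int main(void)
{
        long offsets[] = { 12, 3 };     /* path from block_to_path() */
        int depth = 2;
        struct ind chain[4], *q = chain;
        long *off = offsets;

        q->p = inode_i_data + *off;
        q->key = *q->p;
        while (--depth && q->key) {
                uint32_t *data = read_block(q->key);

                if (!data)
                        return 1;       /* the kernel would return -EIO */
                ++q;
                q->p = data + *++off;
                q->key = *q->p;
        }
        printf("data block = %u\n", q->key);    /* prints 555 */
        return 0;
}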
184 | |||
185 | /** | ||
186 | * ext4_find_near - find a place for allocation with sufficient locality | ||
187 | * @inode: owner | ||
188 | * @ind: descriptor of indirect block. | ||
189 | * | ||
190 | * This function returns the preferred place for block allocation. | ||
191 | * It is used when heuristic for sequential allocation fails. | ||
192 | * Rules are: | ||
193 | * + if there is a block to the left of our position - allocate near it. | ||
194 | * + if the pointer will live in an indirect block - allocate near that block. | ||
195 | * + if the pointer will live in the inode - allocate in the same | ||
196 | * cylinder group. | ||
197 | * | ||
198 | * In the latter case we colour the starting block by the caller's PID to | ||
199 | * prevent it from clashing with concurrent allocations for a different inode | ||
200 | * in the same block group. The PID is used here so that functionally related | ||
201 | * files will be close together on disk. | ||
202 | * | ||
203 | * Caller must make sure that @ind is valid and will stay that way. | ||
204 | */ | ||
205 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | ||
206 | { | ||
207 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
208 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; | ||
209 | __le32 *p; | ||
210 | |||
211 | /* Try to find previous block */ | ||
212 | for (p = ind->p - 1; p >= start; p--) { | ||
213 | if (*p) | ||
214 | return le32_to_cpu(*p); | ||
215 | } | ||
216 | |||
217 | /* No such thing, so let's try the location of the indirect block */ | ||
218 | if (ind->bh) | ||
219 | return ind->bh->b_blocknr; | ||
220 | |||
221 | /* | ||
222 | * Is it going to be referred to from the inode itself? OK, just put it | ||
223 | * into the same cylinder group then. | ||
224 | */ | ||
225 | return ext4_inode_to_goal_block(inode); | ||
226 | } | ||
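
The backward scan is simple enough to demonstrate standalone. A minimal sketch (not kernel code; find_near is an invented name, and the fallback argument folds together the indirect-block-location and inode-goal cases):

#include <stdint.h>
#include <stdio.h>

/* Prefer the closest earlier mapped block in the same pointer array;
 * otherwise fall back (indirect block's own location, or inode goal). */
static uint32_t find_near(const uint32_t *start, const uint32_t *pos,
                          uint32_t fallback)
{
        const uint32_t *p;

        for (p = pos - 1; p >= start; p--)
                if (*p)
                        return *p;
        return fallback;
}

int main(void)
{
        uint32_t slots[6] = { 900, 0, 0, 905, 0, 0 };

        /* Allocating for slot 5: nearest earlier mapped block is 905. */
        printf("%u\n", find_near(slots, slots + 5, 1234));
        return 0;
}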
227 | |||
228 | /** | ||
229 | * ext4_find_goal - find a preferred place for allocation. | ||
230 | * @inode: owner | ||
231 | * @block: block we want | ||
232 | * @partial: pointer to the last triple within a chain | ||
233 | * | ||
234 | * Normally this function finds the preferred place for block allocation | ||
235 | * and returns it. | ||
236 | * Because this is only used for non-extent files, we limit the block nr | ||
237 | * to 32 bits. | ||
238 | */ | ||
239 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | ||
240 | Indirect *partial) | ||
241 | { | ||
242 | ext4_fsblk_t goal; | ||
243 | |||
244 | /* | ||
245 | * XXX need to get goal block from mballoc's data structures | ||
246 | */ | ||
247 | |||
248 | goal = ext4_find_near(inode, partial); | ||
249 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
250 | return goal; | ||
251 | } | ||
252 | |||
253 | /** | ||
254 | * ext4_blks_to_allocate - Look up the block map and count the number | ||
255 | * of direct blocks that need to be allocated for the given branch. | ||
256 | * | ||
257 | * @branch: chain of indirect blocks | ||
258 | * @k: number of blocks needed for indirect blocks | ||
259 | * @blks: number of data blocks to be mapped. | ||
260 | * @blocks_to_boundary: the offset in the indirect block | ||
261 | * | ||
262 | * Returns the total number of blocks to be allocated, including the | ||
263 | * direct and indirect blocks. | ||
264 | */ | ||
265 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | ||
266 | int blocks_to_boundary) | ||
267 | { | ||
268 | unsigned int count = 0; | ||
269 | |||
270 | /* | ||
271 | * Simple case: if the [t,d]indirect block(s) have not been allocated | ||
272 | * yet, then clearly the blocks on that path have not been allocated either. | ||
273 | */ | ||
274 | if (k > 0) { | ||
275 | /* right now we don't handle cross-boundary allocation */ | ||
276 | if (blks < blocks_to_boundary + 1) | ||
277 | count += blks; | ||
278 | else | ||
279 | count += blocks_to_boundary + 1; | ||
280 | return count; | ||
281 | } | ||
282 | |||
283 | count++; | ||
284 | while (count < blks && count <= blocks_to_boundary && | ||
285 | le32_to_cpu(*(branch[0].p + count)) == 0) { | ||
286 | count++; | ||
287 | } | ||
288 | return count; | ||
289 | } | ||
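
For the k == 0 path (the indirect chain already exists), the counting can be checked with a small user-space sketch under assumed values; blks_to_allocate is an invented name mirroring only that path:

#include <stdint.h>
#include <stdio.h>

/* Count direct blocks to allocate: stop at the requested length, at the
 * indirect-block boundary, or at the first slot that is already mapped. */
static int blks_to_allocate(const uint32_t *p, unsigned int blks,
                            int blocks_to_boundary)
{
        int count = 1;          /* the block we came here for */

        while (count < (int)blks && count <= blocks_to_boundary &&
               p[count] == 0)
                count++;
        return count;
}

int main(void)
{
        /* Slots 0..3 are holes; slot 4 already points at block 777. */
        uint32_t slots[8] = { 0, 0, 0, 0, 777, 0, 0, 0 };

        /* Asked for 6 blocks with 5 more slots before the boundary: the
         * run stops at the mapped slot, so only 4 are allocated. */
        printf("%d\n", blks_to_allocate(slots, 6, 5));
        return 0;
}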
290 | |||
291 | /** | ||
292 | * ext4_alloc_blocks: allocate the multiple blocks needed for a branch | ||
293 | * @handle: handle for this transaction | ||
294 | * @inode: inode which needs allocated blocks | ||
295 | * @iblock: the logical block to start allocating at | ||
296 | * @goal: preferred physical block of allocation | ||
297 | * @indirect_blks: the number of blocks that need to be allocated for | ||
298 | * indirect blocks | ||
299 | * @blks: number of desired blocks | ||
300 | * @new_blocks: on return it will store the new block numbers for | ||
301 | * the indirect blocks (if needed) and the first direct block | ||
302 | * @err: on return it will store the error code | ||
303 | * | ||
304 | * This function will return the number of blocks allocated as | ||
305 | * requested by the passed-in parameters. | ||
306 | */ | ||
307 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
308 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
309 | int indirect_blks, int blks, | ||
310 | ext4_fsblk_t new_blocks[4], int *err) | ||
311 | { | ||
312 | struct ext4_allocation_request ar; | ||
313 | int target, i; | ||
314 | unsigned long count = 0, blk_allocated = 0; | ||
315 | int index = 0; | ||
316 | ext4_fsblk_t current_block = 0; | ||
317 | int ret = 0; | ||
318 | |||
319 | /* | ||
320 | * Here we try to allocate the requested multiple blocks at once, | ||
321 | * on a best-effort basis. | ||
322 | * To build a branch, we should allocate blocks for | ||
323 | * the indirect blocks (if not allocated yet), and at least | ||
324 | * the first direct block of this branch. That's the | ||
325 | * minimum number of blocks that need to be allocated (required). | ||
326 | */ | ||
327 | /* first we try to allocate the indirect blocks */ | ||
328 | target = indirect_blks; | ||
329 | while (target > 0) { | ||
330 | count = target; | ||
331 | /* allocating blocks for indirect blocks and direct blocks */ | ||
332 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
333 | 0, &count, err); | ||
334 | if (*err) | ||
335 | goto failed_out; | ||
336 | |||
337 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
338 | EXT4_ERROR_INODE(inode, | ||
339 | "current_block %llu + count %lu > %d!", | ||
340 | current_block, count, | ||
341 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
342 | *err = -EIO; | ||
343 | goto failed_out; | ||
344 | } | ||
345 | |||
346 | target -= count; | ||
347 | /* allocate blocks for indirect blocks */ | ||
348 | while (index < indirect_blks && count) { | ||
349 | new_blocks[index++] = current_block++; | ||
350 | count--; | ||
351 | } | ||
352 | if (count > 0) { | ||
353 | /* | ||
354 | * save the new block number | ||
355 | * for the first direct block | ||
356 | */ | ||
357 | new_blocks[index] = current_block; | ||
358 | printk(KERN_INFO "%s returned more blocks than " | ||
359 | "requested\n", __func__); | ||
360 | WARN_ON(1); | ||
361 | break; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | target = blks - count; | ||
366 | blk_allocated = count; | ||
367 | if (!target) | ||
368 | goto allocated; | ||
369 | /* Now allocate data blocks */ | ||
370 | memset(&ar, 0, sizeof(ar)); | ||
371 | ar.inode = inode; | ||
372 | ar.goal = goal; | ||
373 | ar.len = target; | ||
374 | ar.logical = iblock; | ||
375 | if (S_ISREG(inode->i_mode)) | ||
376 | /* enable in-core preallocation only for regular files */ | ||
377 | ar.flags = EXT4_MB_HINT_DATA; | ||
378 | |||
379 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
380 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
381 | EXT4_ERROR_INODE(inode, | ||
382 | "current_block %llu + ar.len %d > %d!", | ||
383 | current_block, ar.len, | ||
384 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
385 | *err = -EIO; | ||
386 | goto failed_out; | ||
387 | } | ||
388 | |||
389 | if (*err && (target == blks)) { | ||
390 | /* | ||
391 | * if the allocation failed and we didn't allocate | ||
392 | * any blocks before | ||
393 | */ | ||
394 | goto failed_out; | ||
395 | } | ||
396 | if (!*err) { | ||
397 | if (target == blks) { | ||
398 | /* | ||
399 | * save the new block number | ||
400 | * for the first direct block | ||
401 | */ | ||
402 | new_blocks[index] = current_block; | ||
403 | } | ||
404 | blk_allocated += ar.len; | ||
405 | } | ||
406 | allocated: | ||
407 | /* total number of blocks allocated for direct blocks */ | ||
408 | ret = blk_allocated; | ||
409 | *err = 0; | ||
410 | return ret; | ||
411 | failed_out: | ||
412 | for (i = 0; i < index; i++) | ||
413 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
414 | return ret; | ||
415 | } | ||
416 | |||
417 | /** | ||
418 | * ext4_alloc_branch - allocate and set up a chain of blocks. | ||
419 | * @handle: handle for this transaction | ||
420 | * @inode: owner | ||
421 | * @indirect_blks: number of allocated indirect blocks | ||
422 | * @blks: number of allocated direct blocks | ||
423 | * @goal: preferred place for allocation | ||
424 | * @offsets: offsets (in the blocks) to store the pointers to the next blocks. | ||
425 | * @branch: place to store the chain in. | ||
426 | * | ||
427 | * This function allocates blocks, zeroes out all but the last one, | ||
428 | * links them into chain and (if we are synchronous) writes them to disk. | ||
429 | * In other words, it prepares a branch that can be spliced onto the | ||
430 | * inode. It stores the information about that chain in the branch[], in | ||
431 | * the same format as ext4_get_branch() would do. We are calling it after | ||
432 | * we have read the existing part of the chain, and partial points to the last | ||
433 | * triple of that (the one with zero ->key). Upon exit we have the same | ||
434 | * picture as after the successful ext4_get_block(), except that in one | ||
435 | * place the chain is disconnected - *branch->p is still zero (we did not | ||
436 | * set the last link), but branch->key contains the number that should | ||
437 | * be placed into *branch->p to fill that gap. | ||
438 | * | ||
439 | * If allocation fails we free all blocks we've allocated (and forget | ||
440 | * their buffer_heads) and return the error value from the failed | ||
441 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | ||
442 | * as described above and return 0. | ||
443 | */ | ||
444 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | ||
445 | ext4_lblk_t iblock, int indirect_blks, | ||
446 | int *blks, ext4_fsblk_t goal, | ||
447 | ext4_lblk_t *offsets, Indirect *branch) | ||
448 | { | ||
449 | int blocksize = inode->i_sb->s_blocksize; | ||
450 | int i, n = 0; | ||
451 | int err = 0; | ||
452 | struct buffer_head *bh; | ||
453 | int num; | ||
454 | ext4_fsblk_t new_blocks[4]; | ||
455 | ext4_fsblk_t current_block; | ||
456 | |||
457 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
458 | *blks, new_blocks, &err); | ||
459 | if (err) | ||
460 | return err; | ||
461 | |||
462 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
463 | /* | ||
464 | * metadata blocks and data blocks are allocated. | ||
465 | */ | ||
466 | for (n = 1; n <= indirect_blks; n++) { | ||
467 | /* | ||
468 | * Get the buffer_head for the parent block, zero it out | ||
469 | * and set the pointer to the new one, then send the | ||
470 | * parent to disk. | ||
471 | */ | ||
472 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | ||
473 | if (unlikely(!bh)) { | ||
474 | err = -EIO; | ||
475 | goto failed; | ||
476 | } | ||
477 | |||
478 | branch[n].bh = bh; | ||
479 | lock_buffer(bh); | ||
480 | BUFFER_TRACE(bh, "call get_create_access"); | ||
481 | err = ext4_journal_get_create_access(handle, bh); | ||
482 | if (err) { | ||
483 | /* Don't brelse(bh) here; it's done in | ||
484 | * ext4_journal_forget() below */ | ||
485 | unlock_buffer(bh); | ||
486 | goto failed; | ||
487 | } | ||
488 | |||
489 | memset(bh->b_data, 0, blocksize); | ||
490 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | ||
491 | branch[n].key = cpu_to_le32(new_blocks[n]); | ||
492 | *branch[n].p = branch[n].key; | ||
493 | if (n == indirect_blks) { | ||
494 | current_block = new_blocks[n]; | ||
495 | /* | ||
496 | * End of chain, update the last new metablock of | ||
497 | * the chain to point to the newly allocated | ||
498 | * data block numbers | ||
499 | */ | ||
500 | for (i = 1; i < num; i++) | ||
501 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
502 | } | ||
503 | BUFFER_TRACE(bh, "marking uptodate"); | ||
504 | set_buffer_uptodate(bh); | ||
505 | unlock_buffer(bh); | ||
506 | |||
507 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
508 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
509 | if (err) | ||
510 | goto failed; | ||
511 | } | ||
512 | *blks = num; | ||
513 | return err; | ||
514 | failed: | ||
515 | /* Allocation failed, free what we already allocated */ | ||
516 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | ||
517 | for (i = 1; i <= n ; i++) { | ||
518 | /* | ||
519 | * branch[i].bh is newly allocated, so there is no | ||
520 | * need to revoke the block, which is why we don't | ||
521 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
522 | */ | ||
523 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
524 | EXT4_FREE_BLOCKS_FORGET); | ||
525 | } | ||
526 | for (i = n+1; i < indirect_blks; i++) | ||
527 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
528 | |||
529 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
530 | |||
531 | return err; | ||
532 | } | ||
533 | |||
534 | /** | ||
535 | * ext4_splice_branch - splice the allocated branch onto inode. | ||
536 | * @handle: handle for this transaction | ||
537 | * @inode: owner | ||
538 | * @block: (logical) number of block we are adding | ||
539 | * @chain: chain of indirect blocks (with a missing link - see | ||
540 | * ext4_alloc_branch) | ||
541 | * @where: location of missing link | ||
542 | * @num: number of indirect blocks we are adding | ||
543 | * @blks: number of direct blocks we are adding | ||
544 | * | ||
545 | * This function fills the missing link and does all housekeeping needed in | ||
546 | * inode (->i_blocks, etc.). In case of success we end up with the full | ||
547 | * chain to the new block and return 0. | ||
548 | */ | ||
549 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | ||
550 | ext4_lblk_t block, Indirect *where, int num, | ||
551 | int blks) | ||
552 | { | ||
553 | int i; | ||
554 | int err = 0; | ||
555 | ext4_fsblk_t current_block; | ||
556 | |||
557 | /* | ||
558 | * If we're splicing into a [td]indirect block (as opposed to the | ||
559 | * inode) then we need to get write access to the [td]indirect block | ||
560 | * before the splice. | ||
561 | */ | ||
562 | if (where->bh) { | ||
563 | BUFFER_TRACE(where->bh, "get_write_access"); | ||
564 | err = ext4_journal_get_write_access(handle, where->bh); | ||
565 | if (err) | ||
566 | goto err_out; | ||
567 | } | ||
568 | /* That's it */ | ||
569 | |||
570 | *where->p = where->key; | ||
571 | |||
572 | /* | ||
573 | * Update the host buffer_head or inode to point to the just-allocated | ||
574 | * direct blocks | ||
575 | */ | ||
576 | if (num == 0 && blks > 1) { | ||
577 | current_block = le32_to_cpu(where->key) + 1; | ||
578 | for (i = 1; i < blks; i++) | ||
579 | *(where->p + i) = cpu_to_le32(current_block++); | ||
580 | } | ||
581 | |||
582 | /* We are done with atomic stuff, now do the rest of housekeeping */ | ||
583 | /* had we spliced it onto an indirect block? */ | ||
584 | if (where->bh) { | ||
585 | /* | ||
586 | * If we spliced it onto an indirect block, we haven't | ||
587 | * altered the inode. Note however that if it is being spliced | ||
588 | * onto an indirect block at the very end of the file (the | ||
589 | * file is growing) then we *will* alter the inode to reflect | ||
590 | * the new i_size. But that is not done here - it is done in | ||
591 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | ||
592 | */ | ||
593 | jbd_debug(5, "splicing indirect only\n"); | ||
594 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | ||
595 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | ||
596 | if (err) | ||
597 | goto err_out; | ||
598 | } else { | ||
599 | /* | ||
600 | * OK, we spliced it into the inode itself on a direct block. | ||
601 | */ | ||
602 | ext4_mark_inode_dirty(handle, inode); | ||
603 | jbd_debug(5, "splicing direct\n"); | ||
604 | } | ||
605 | return err; | ||
606 | |||
607 | err_out: | ||
608 | for (i = 1; i <= num; i++) { | ||
609 | /* | ||
610 | * branch[i].bh is newly allocated, so there is no | ||
611 | * need to revoke the block, which is why we don't | ||
612 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
613 | */ | ||
614 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | ||
615 | EXT4_FREE_BLOCKS_FORGET); | ||
616 | } | ||
617 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | ||
618 | blks, 0); | ||
619 | |||
620 | return err; | ||
621 | } | ||
622 | |||
623 | /* | ||
624 | * The ext4_ind_map_blocks() function handles non-extents inodes | ||
625 | * (i.e., using the traditional indirect/double-indirect i_blocks | ||
626 | * scheme) for ext4_map_blocks(). | ||
627 | * | ||
628 | * Allocation strategy is simple: if we have to allocate something, we will | ||
629 | * have to go the whole way to leaf. So let's do it before attaching anything | ||
630 | * to tree, set linkage between the newborn blocks, write them if sync is | ||
631 | * required, recheck the path, free and repeat if check fails, otherwise | ||
632 | * set the last missing link (that will protect us from any truncate-generated | ||
633 | * removals - all blocks on the path are immune now) and possibly force the | ||
634 | * write on the parent block. | ||
635 | * That has a nice additional property: no special recovery from the failed | ||
636 | * allocations is needed - we simply release blocks and do not touch anything | ||
637 | * reachable from inode. | ||
638 | * | ||
639 | * `handle' can be NULL if create == 0. | ||
640 | * | ||
641 | * return > 0, # of blocks mapped or allocated. | ||
642 | * return = 0, if plain lookup failed. | ||
643 | * return < 0, error case. | ||
644 | * | ||
645 | * The ext4_ind_map_blocks() function should be called with | ||
646 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | ||
647 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | ||
648 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system | ||
649 | * blocks. | ||
650 | */ | ||
651 | int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | ||
652 | struct ext4_map_blocks *map, | ||
653 | int flags) | ||
654 | { | ||
655 | int err = -EIO; | ||
656 | ext4_lblk_t offsets[4]; | ||
657 | Indirect chain[4]; | ||
658 | Indirect *partial; | ||
659 | ext4_fsblk_t goal; | ||
660 | int indirect_blks; | ||
661 | int blocks_to_boundary = 0; | ||
662 | int depth; | ||
663 | int count = 0; | ||
664 | ext4_fsblk_t first_block = 0; | ||
665 | |||
666 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | ||
667 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | ||
668 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | ||
669 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | ||
670 | &blocks_to_boundary); | ||
671 | |||
672 | if (depth == 0) | ||
673 | goto out; | ||
674 | |||
675 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | ||
676 | |||
677 | /* Simplest case - block found, no allocation needed */ | ||
678 | if (!partial) { | ||
679 | first_block = le32_to_cpu(chain[depth - 1].key); | ||
680 | count++; | ||
681 | /* map more blocks */ | ||
682 | while (count < map->m_len && count <= blocks_to_boundary) { | ||
683 | ext4_fsblk_t blk; | ||
684 | |||
685 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | ||
686 | |||
687 | if (blk == first_block + count) | ||
688 | count++; | ||
689 | else | ||
690 | break; | ||
691 | } | ||
692 | goto got_it; | ||
693 | } | ||
694 | |||
695 | /* Next simple case - plain lookup or failed read of indirect block */ | ||
696 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | ||
697 | goto cleanup; | ||
698 | |||
699 | /* | ||
700 | * Okay, we need to do block allocation. | ||
701 | */ | ||
702 | goal = ext4_find_goal(inode, map->m_lblk, partial); | ||
703 | |||
704 | /* the number of blocks we need to allocate for [d,t]indirect blocks */ | ||
705 | indirect_blks = (chain + depth) - partial - 1; | ||
706 | |||
707 | /* | ||
708 | * Next look up the indirect map to count the total number of | ||
709 | * direct blocks to allocate for this branch. | ||
710 | */ | ||
711 | count = ext4_blks_to_allocate(partial, indirect_blks, | ||
712 | map->m_len, blocks_to_boundary); | ||
713 | /* | ||
714 | * Block out ext4_truncate while we alter the tree | ||
715 | */ | ||
716 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | ||
717 | &count, goal, | ||
718 | offsets + (partial - chain), partial); | ||
719 | |||
720 | /* | ||
721 | * The ext4_splice_branch call will free and forget any buffers | ||
722 | * on the new chain if there is a failure, but that risks using | ||
723 | * up transaction credits, especially for bitmaps where the | ||
724 | * credits cannot be returned. Can we handle this somehow? We | ||
725 | * may need to return -EAGAIN upwards in the worst case. --sct | ||
726 | */ | ||
727 | if (!err) | ||
728 | err = ext4_splice_branch(handle, inode, map->m_lblk, | ||
729 | partial, indirect_blks, count); | ||
730 | if (err) | ||
731 | goto cleanup; | ||
732 | |||
733 | map->m_flags |= EXT4_MAP_NEW; | ||
734 | |||
735 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
736 | got_it: | ||
737 | map->m_flags |= EXT4_MAP_MAPPED; | ||
738 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | ||
739 | map->m_len = count; | ||
740 | if (count > blocks_to_boundary) | ||
741 | map->m_flags |= EXT4_MAP_BOUNDARY; | ||
742 | err = count; | ||
743 | /* Clean up and exit */ | ||
744 | partial = chain + depth - 1; /* the whole chain */ | ||
745 | cleanup: | ||
746 | while (partial > chain) { | ||
747 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
748 | brelse(partial->bh); | ||
749 | partial--; | ||
750 | } | ||
751 | out: | ||
752 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, | ||
753 | map->m_pblk, map->m_len, err); | ||
754 | return err; | ||
755 | } | ||
756 | |||
757 | /* | ||
758 | * O_DIRECT for ext3 (or indirect map) based files | ||
759 | * | ||
760 | * If the O_DIRECT write will extend the file then add this inode to the | ||
761 | * orphan list. So recovery will truncate it back to the original size | ||
762 | * if the machine crashes during the write. | ||
763 | * | ||
764 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | ||
765 | * crashes then stale disk data _may_ be exposed inside the file. But current | ||
766 | * VFS code falls back to the buffered path in that case so we are safe. | ||
767 | */ | ||
768 | ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
769 | const struct iovec *iov, loff_t offset, | ||
770 | unsigned long nr_segs) | ||
771 | { | ||
772 | struct file *file = iocb->ki_filp; | ||
773 | struct inode *inode = file->f_mapping->host; | ||
774 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
775 | handle_t *handle; | ||
776 | ssize_t ret; | ||
777 | int orphan = 0; | ||
778 | size_t count = iov_length(iov, nr_segs); | ||
779 | int retries = 0; | ||
780 | |||
781 | if (rw == WRITE) { | ||
782 | loff_t final_size = offset + count; | ||
783 | |||
784 | if (final_size > inode->i_size) { | ||
785 | /* Credits for sb + inode write */ | ||
786 | handle = ext4_journal_start(inode, 2); | ||
787 | if (IS_ERR(handle)) { | ||
788 | ret = PTR_ERR(handle); | ||
789 | goto out; | ||
790 | } | ||
791 | ret = ext4_orphan_add(handle, inode); | ||
792 | if (ret) { | ||
793 | ext4_journal_stop(handle); | ||
794 | goto out; | ||
795 | } | ||
796 | orphan = 1; | ||
797 | ei->i_disksize = inode->i_size; | ||
798 | ext4_journal_stop(handle); | ||
799 | } | ||
800 | } | ||
801 | |||
802 | retry: | ||
803 | if (rw == READ && ext4_should_dioread_nolock(inode)) | ||
804 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
805 | inode->i_sb->s_bdev, iov, | ||
806 | offset, nr_segs, | ||
807 | ext4_get_block, NULL, NULL, 0); | ||
808 | else { | ||
809 | ret = blockdev_direct_IO(rw, iocb, inode, iov, | ||
810 | offset, nr_segs, ext4_get_block); | ||
811 | |||
812 | if (unlikely((rw & WRITE) && ret < 0)) { | ||
813 | loff_t isize = i_size_read(inode); | ||
814 | loff_t end = offset + iov_length(iov, nr_segs); | ||
815 | |||
816 | if (end > isize) | ||
817 | ext4_truncate_failed_write(inode); | ||
818 | } | ||
819 | } | ||
820 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
821 | goto retry; | ||
822 | |||
823 | if (orphan) { | ||
824 | int err; | ||
825 | |||
826 | /* Credits for sb + inode write */ | ||
827 | handle = ext4_journal_start(inode, 2); | ||
828 | if (IS_ERR(handle)) { | ||
829 | /* This is really bad luck. We've written the data | ||
830 | * but cannot extend i_size. Bail out and pretend | ||
831 | * the write failed... */ | ||
832 | ret = PTR_ERR(handle); | ||
833 | if (inode->i_nlink) | ||
834 | ext4_orphan_del(NULL, inode); | ||
835 | |||
836 | goto out; | ||
837 | } | ||
838 | if (inode->i_nlink) | ||
839 | ext4_orphan_del(handle, inode); | ||
840 | if (ret > 0) { | ||
841 | loff_t end = offset + ret; | ||
842 | if (end > inode->i_size) { | ||
843 | ei->i_disksize = end; | ||
844 | i_size_write(inode, end); | ||
845 | /* | ||
846 | * We're going to return a positive `ret' | ||
847 | * here due to non-zero-length I/O, so there's | ||
848 | * no way of reporting error returns from | ||
849 | * ext4_mark_inode_dirty() to userspace. So | ||
850 | * ignore it. | ||
851 | */ | ||
852 | ext4_mark_inode_dirty(handle, inode); | ||
853 | } | ||
854 | } | ||
855 | err = ext4_journal_stop(handle); | ||
856 | if (ret == 0) | ||
857 | ret = err; | ||
858 | } | ||
859 | out: | ||
860 | return ret; | ||
861 | } | ||
862 | |||
863 | /* | ||
864 | * Calculate the number of metadata blocks that need to be reserved | ||
865 | * to allocate a new block at @lblock for a non-extent-based file | ||
866 | */ | ||
867 | int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock) | ||
868 | { | ||
869 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
870 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); | ||
871 | int blk_bits; | ||
872 | |||
873 | if (lblock < EXT4_NDIR_BLOCKS) | ||
874 | return 0; | ||
875 | |||
876 | lblock -= EXT4_NDIR_BLOCKS; | ||
877 | |||
878 | if (ei->i_da_metadata_calc_len && | ||
879 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | ||
880 | ei->i_da_metadata_calc_len++; | ||
881 | return 0; | ||
882 | } | ||
883 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | ||
884 | ei->i_da_metadata_calc_len = 1; | ||
885 | blk_bits = order_base_2(lblock); | ||
886 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | ||
887 | } | ||
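
The reservation arithmetic is easiest to see with concrete numbers. A minimal sketch (illustrative only; assumes 4KiB blocks, so EXT4_ADDR_PER_BLOCK_BITS is 10, and omits the i_da_metadata_calc_* caching above):

#include <stdio.h>

/* order_base_2(n): smallest b such that 2^b >= n, for n >= 1. */
static int order_base_2(unsigned long n)
{
        int b = 0;

        while ((1UL << b) < n)
                b++;
        return b;
}

int main(void)
{
        unsigned long lblock = 5000;    /* lives in the dindirect tree */

        lblock -= 12;                   /* skip the direct blocks */
        /* 4988 -> order_base_2 = 13, and 13 / 10 + 1 = 2: reserve for
         * one indirect block plus one double indirect block. */
        printf("%d\n", order_base_2(lblock) / 10 + 1);
        return 0;
}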
888 | |||
889 | int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) | ||
890 | { | ||
891 | int indirects; | ||
892 | |||
893 | /* if nrblocks are contiguous */ | ||
894 | if (chunk) { | ||
895 | /* | ||
896 | * With N contiguous data blocks, we need at most | ||
897 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
898 | * 2 dindirect blocks, and 1 tindirect block | ||
899 | */ | ||
900 | return DIV_ROUND_UP(nrblocks, | ||
901 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
902 | } | ||
903 | /* | ||
904 | * if nrblocks are not contiguous, worst case each block touches | ||
905 | * an indirect block, and each indirect block touches a double indirect | ||
906 | * block, plus a triple indirect block | ||
907 | */ | ||
908 | indirects = nrblocks * 2 + 1; | ||
909 | return indirects; | ||
910 | } | ||
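
Worked numbers for both cases (a sketch, not kernel code; ind_trans_blocks is an invented name, and addr_per_block = 1024 assumes 4KiB blocks):

#include <stdio.h>

/* Mirror of the two cases above: contiguous chunks share indirect
 * blocks, while scattered blocks may each touch their own. */
static int ind_trans_blocks(int nrblocks, int chunk)
{
        if (chunk)      /* N/1024 indirect blocks + dind/tind slack */
                return (nrblocks + 1023) / 1024 + 4;
        return nrblocks * 2 + 1;        /* worst case when scattered */
}

int main(void)
{
        printf("%d\n", ind_trans_blocks(100, 1));       /* 1 + 4 = 5 */
        printf("%d\n", ind_trans_blocks(100, 0));       /* 201 */
        return 0;
}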
911 | |||
912 | /* | ||
913 | * Truncate transactions can be complex and absolutely huge. So we need to | ||
914 | * be able to restart the transaction at a convenient checkpoint to make | ||
915 | * sure we don't overflow the journal. | ||
916 | * | ||
917 | * start_transaction gets us a new handle for a truncate transaction, | ||
918 | * and extend_transaction tries to extend the existing one a bit. If | ||
919 | * extend fails, we need to propagate the failure up and restart the | ||
920 | * transaction in the top-level truncate loop. --sct | ||
921 | */ | ||
922 | static handle_t *start_transaction(struct inode *inode) | ||
923 | { | ||
924 | handle_t *result; | ||
925 | |||
926 | result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)); | ||
927 | if (!IS_ERR(result)) | ||
928 | return result; | ||
929 | |||
930 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
931 | return result; | ||
932 | } | ||
933 | |||
934 | /* | ||
935 | * Try to extend this transaction for the purposes of truncation. | ||
936 | * | ||
937 | * Returns 0 if we managed to create more room. If we can't create more | ||
938 | * room, and the transaction must be restarted we return 1. | ||
939 | */ | ||
940 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | ||
941 | { | ||
942 | if (!ext4_handle_valid(handle)) | ||
943 | return 0; | ||
944 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | ||
945 | return 0; | ||
946 | if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode))) | ||
947 | return 0; | ||
948 | return 1; | ||
949 | } | ||
950 | |||
951 | /* | ||
952 | * Probably it should be a library function... search for first non-zero word | ||
953 | * or memcmp with zero_page, whatever is better for particular architecture. | ||
954 | * Linus? | ||
955 | */ | ||
956 | static inline int all_zeroes(__le32 *p, __le32 *q) | ||
957 | { | ||
958 | while (p < q) | ||
959 | if (*p++) | ||
960 | return 0; | ||
961 | return 1; | ||
962 | } | ||
963 | |||
964 | /** | ||
965 | * ext4_find_shared - find the indirect blocks for partial truncation. | ||
966 | * @inode: inode in question | ||
967 | * @depth: depth of the affected branch | ||
968 | * @offsets: offsets of pointers in that branch (see ext4_block_to_path) | ||
969 | * @chain: place to store the pointers to partial indirect blocks | ||
970 | * @top: place to put the (detached) top of the branch | ||
971 | * | ||
972 | * This is a helper function used by ext4_truncate(). | ||
973 | * | ||
974 | * When we do truncate() we may have to clean the ends of several | ||
975 | * indirect blocks but leave the blocks themselves alive. A block is | ||
976 | * partially truncated if some data below the new i_size is referred | ||
977 | * to from it (and it is on the path to the first completely truncated | ||
978 | * data block, indeed). We have to free the top of that path along | ||
979 | * with everything to the right of the path. Since no allocation | ||
980 | * past the truncation point is possible until ext4_truncate() | ||
981 | * finishes, we may safely do the latter, but top of branch may | ||
982 | * require special attention - pageout below the truncation point | ||
983 | * might try to populate it. | ||
984 | * | ||
985 | * We atomically detach the top of branch from the tree, store the | ||
986 | * block number of its root in *@top, pointers to buffer_heads of | ||
987 | * partially truncated blocks - in @chain[].bh and pointers to | ||
988 | * their last elements that should not be removed - in | ||
989 | * @chain[].p. Return value is the pointer to last filled element | ||
990 | * of @chain. | ||
991 | * | ||
992 | * The work of actually freeing the subtrees is left to the caller: | ||
993 | * a) free the subtree starting from *@top | ||
994 | * b) free the subtrees whose roots are stored in | ||
995 | * (@chain[i].p+1 .. end of @chain[i].bh->b_data) | ||
996 | * c) free the subtrees growing from the inode past the @chain[0]. | ||
997 | * (no partially truncated stuff there). */ | ||
998 | |||
999 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | ||
1000 | ext4_lblk_t offsets[4], Indirect chain[4], | ||
1001 | __le32 *top) | ||
1002 | { | ||
1003 | Indirect *partial, *p; | ||
1004 | int k, err; | ||
1005 | |||
1006 | *top = 0; | ||
1007 | /* Make k index the deepest non-null offset + 1 */ | ||
1008 | for (k = depth; k > 1 && !offsets[k-1]; k--) | ||
1009 | ; | ||
1010 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | ||
1011 | /* Writer: pointers */ | ||
1012 | if (!partial) | ||
1013 | partial = chain + k-1; | ||
1014 | /* | ||
1015 | * If the branch acquired continuation since we've looked at it - | ||
1016 | * fine, it should all survive and (new) top doesn't belong to us. | ||
1017 | */ | ||
1018 | if (!partial->key && *partial->p) | ||
1019 | /* Writer: end */ | ||
1020 | goto no_top; | ||
1021 | for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) | ||
1022 | ; | ||
1023 | /* | ||
1024 | * OK, we've found the last block that must survive. The rest of our | ||
1025 | * branch should be detached before unlocking. However, if that rest | ||
1026 | * of branch is all ours and does not grow immediately from the inode | ||
1027 | * it's easier to cheat and just decrement partial->p. | ||
1028 | */ | ||
1029 | if (p == chain + k - 1 && p > chain) { | ||
1030 | p->p--; | ||
1031 | } else { | ||
1032 | *top = *p->p; | ||
1033 | /* Nope, don't do this in ext4. Must leave the tree intact */ | ||
1034 | #if 0 | ||
1035 | *p->p = 0; | ||
1036 | #endif | ||
1037 | } | ||
1038 | /* Writer: end */ | ||
1039 | |||
1040 | while (partial > p) { | ||
1041 | brelse(partial->bh); | ||
1042 | partial--; | ||
1043 | } | ||
1044 | no_top: | ||
1045 | return partial; | ||
1046 | } | ||
1047 | |||
1048 | /* | ||
1049 | * Zero a number of block pointers in either an inode or an indirect block. | ||
1050 | * If we restart the transaction we must again get write access to the | ||
1051 | * indirect block for further modification. | ||
1052 | * | ||
1053 | * We release `count' blocks on disk, but (last - first) may be greater | ||
1054 | * than `count' because there can be holes in there. | ||
1055 | * | ||
1056 | * Return 0 on success, 1 on invalid block range | ||
1057 | * and < 0 on fatal error. | ||
1058 | */ | ||
1059 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
1060 | struct buffer_head *bh, | ||
1061 | ext4_fsblk_t block_to_free, | ||
1062 | unsigned long count, __le32 *first, | ||
1063 | __le32 *last) | ||
1064 | { | ||
1065 | __le32 *p; | ||
1066 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
1067 | int err; | ||
1068 | |||
1069 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
1070 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
1071 | |||
1072 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
1073 | count)) { | ||
1074 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
1075 | "blocks %llu len %lu", | ||
1076 | (unsigned long long) block_to_free, count); | ||
1077 | return 1; | ||
1078 | } | ||
1079 | |||
1080 | if (try_to_extend_transaction(handle, inode)) { | ||
1081 | if (bh) { | ||
1082 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
1083 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
1084 | if (unlikely(err)) | ||
1085 | goto out_err; | ||
1086 | } | ||
1087 | err = ext4_mark_inode_dirty(handle, inode); | ||
1088 | if (unlikely(err)) | ||
1089 | goto out_err; | ||
1090 | err = ext4_truncate_restart_trans(handle, inode, | ||
1091 | ext4_blocks_for_truncate(inode)); | ||
1092 | if (unlikely(err)) | ||
1093 | goto out_err; | ||
1094 | if (bh) { | ||
1095 | BUFFER_TRACE(bh, "retaking write access"); | ||
1096 | err = ext4_journal_get_write_access(handle, bh); | ||
1097 | if (unlikely(err)) | ||
1098 | goto out_err; | ||
1099 | } | ||
1100 | } | ||
1101 | |||
1102 | for (p = first; p < last; p++) | ||
1103 | *p = 0; | ||
1104 | |||
1105 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); | ||
1106 | return 0; | ||
1107 | out_err: | ||
1108 | ext4_std_error(inode->i_sb, err); | ||
1109 | return err; | ||
1110 | } | ||
1111 | |||
1112 | /** | ||
1113 | * ext4_free_data - free a list of data blocks | ||
1114 | * @handle: handle for this transaction | ||
1115 | * @inode: inode we are dealing with | ||
1116 | * @this_bh: indirect buffer_head which contains *@first and *@last | ||
1117 | * @first: array of block numbers | ||
1118 | * @last: points immediately past the end of array | ||
1119 | * | ||
1120 | * We are freeing all blocks referred from that array (numbers are stored as | ||
1121 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | ||
1122 | * | ||
1123 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | ||
1124 | * blocks are contiguous then releasing them at one time will only affect one | ||
1125 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | ||
1126 | * actually use a lot of journal space. | ||
1127 | * | ||
1128 | * @this_bh will be %NULL if @first and @last point into the inode's direct | ||
1129 | * block pointers. | ||
1130 | */ | ||
1131 | static void ext4_free_data(handle_t *handle, struct inode *inode, | ||
1132 | struct buffer_head *this_bh, | ||
1133 | __le32 *first, __le32 *last) | ||
1134 | { | ||
1135 | ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ | ||
1136 | unsigned long count = 0; /* Number of blocks in the run */ | ||
1137 | __le32 *block_to_free_p = NULL; /* Pointer into inode/ind | ||
1138 | corresponding to | ||
1139 | block_to_free */ | ||
1140 | ext4_fsblk_t nr; /* Current block # */ | ||
1141 | __le32 *p; /* Pointer into inode/ind | ||
1142 | for current block */ | ||
1143 | int err = 0; | ||
1144 | |||
1145 | if (this_bh) { /* For indirect block */ | ||
1146 | BUFFER_TRACE(this_bh, "get_write_access"); | ||
1147 | err = ext4_journal_get_write_access(handle, this_bh); | ||
1148 | /* Important: if we can't update the indirect pointers | ||
1149 | * to the blocks, we can't free them. */ | ||
1150 | if (err) | ||
1151 | return; | ||
1152 | } | ||
1153 | |||
1154 | for (p = first; p < last; p++) { | ||
1155 | nr = le32_to_cpu(*p); | ||
1156 | if (nr) { | ||
1157 | /* accumulate blocks to free if they're contiguous */ | ||
1158 | if (count == 0) { | ||
1159 | block_to_free = nr; | ||
1160 | block_to_free_p = p; | ||
1161 | count = 1; | ||
1162 | } else if (nr == block_to_free + count) { | ||
1163 | count++; | ||
1164 | } else { | ||
1165 | err = ext4_clear_blocks(handle, inode, this_bh, | ||
1166 | block_to_free, count, | ||
1167 | block_to_free_p, p); | ||
1168 | if (err) | ||
1169 | break; | ||
1170 | block_to_free = nr; | ||
1171 | block_to_free_p = p; | ||
1172 | count = 1; | ||
1173 | } | ||
1174 | } | ||
1175 | } | ||
1176 | |||
1177 | if (!err && count > 0) | ||
1178 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, | ||
1179 | count, block_to_free_p, p); | ||
1180 | if (err < 0) | ||
1181 | /* fatal error */ | ||
1182 | return; | ||
1183 | |||
1184 | if (this_bh) { | ||
1185 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | ||
1186 | |||
1187 | /* | ||
1188 | * The buffer head should have an attached journal head at this | ||
1189 | * point. However, if the data is corrupted and an indirect | ||
1190 | * block pointed to itself, it would have been detached when | ||
1191 | * the block was cleared. Check for this instead of OOPSing. | ||
1192 | */ | ||
1193 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | ||
1194 | ext4_handle_dirty_metadata(handle, inode, this_bh); | ||
1195 | else | ||
1196 | EXT4_ERROR_INODE(inode, | ||
1197 | "circular indirect block detected at " | ||
1198 | "block %llu", | ||
1199 | (unsigned long long) this_bh->b_blocknr); | ||
1200 | } | ||
1201 | } | ||
1202 | |||
1203 | /** | ||
1204 | * ext4_free_branches - free an array of branches | ||
1205 | * @handle: JBD handle for this transaction | ||
1206 | * @inode: inode we are dealing with | ||
1207 | * @parent_bh: the buffer_head which contains *@first and *@last | ||
1208 | * @first: array of block numbers | ||
1209 | * @last: pointer immediately past the end of array | ||
1210 | * @depth: depth of the branches to free | ||
1211 | * | ||
1212 | * We are freeing all blocks referred from these branches (numbers are | ||
1213 | * stored as little-endian 32-bit) and updating @inode->i_blocks | ||
1214 | * appropriately. | ||
1215 | */ | ||
1216 | static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
1217 | struct buffer_head *parent_bh, | ||
1218 | __le32 *first, __le32 *last, int depth) | ||
1219 | { | ||
1220 | ext4_fsblk_t nr; | ||
1221 | __le32 *p; | ||
1222 | |||
1223 | if (ext4_handle_is_aborted(handle)) | ||
1224 | return; | ||
1225 | |||
1226 | if (depth--) { | ||
1227 | struct buffer_head *bh; | ||
1228 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
1229 | p = last; | ||
1230 | while (--p >= first) { | ||
1231 | nr = le32_to_cpu(*p); | ||
1232 | if (!nr) | ||
1233 | continue; /* A hole */ | ||
1234 | |||
1235 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
1236 | nr, 1)) { | ||
1237 | EXT4_ERROR_INODE(inode, | ||
1238 | "invalid indirect mapped " | ||
1239 | "block %lu (level %d)", | ||
1240 | (unsigned long) nr, depth); | ||
1241 | break; | ||
1242 | } | ||
1243 | |||
1244 | /* Go read the buffer for the next level down */ | ||
1245 | bh = sb_bread(inode->i_sb, nr); | ||
1246 | |||
1247 | /* | ||
1248 | * A read failure? Report error and clear slot | ||
1249 | * (should be rare). | ||
1250 | */ | ||
1251 | if (!bh) { | ||
1252 | EXT4_ERROR_INODE_BLOCK(inode, nr, | ||
1253 | "Read failure"); | ||
1254 | continue; | ||
1255 | } | ||
1256 | |||
1257 | /* This zaps the entire block. Bottom up. */ | ||
1258 | BUFFER_TRACE(bh, "free child branches"); | ||
1259 | ext4_free_branches(handle, inode, bh, | ||
1260 | (__le32 *) bh->b_data, | ||
1261 | (__le32 *) bh->b_data + addr_per_block, | ||
1262 | depth); | ||
1263 | brelse(bh); | ||
1264 | |||
1265 | /* | ||
1266 | * Everything below this pointer has been | ||
1267 | * released. Now let this top-of-subtree go. | ||
1268 | * | ||
1269 | * We want the freeing of this indirect block to be | ||
1270 | * atomic in the journal with the updating of the | ||
1271 | * bitmap block which owns it. So make some room in | ||
1272 | * the journal. | ||
1273 | * | ||
1274 | * We zero the parent pointer *after* freeing its | ||
1275 | * pointee in the bitmaps, so if extend_transaction() | ||
1276 | * for some reason fails to put the bitmap changes and | ||
1277 | * the release into the same transaction, recovery | ||
1278 | * will merely complain about releasing a free block, | ||
1279 | * rather than leaking blocks. | ||
1280 | */ | ||
1281 | if (ext4_handle_is_aborted(handle)) | ||
1282 | return; | ||
1283 | if (try_to_extend_transaction(handle, inode)) { | ||
1284 | ext4_mark_inode_dirty(handle, inode); | ||
1285 | ext4_truncate_restart_trans(handle, inode, | ||
1286 | ext4_blocks_for_truncate(inode)); | ||
1287 | } | ||
1288 | |||
1289 | /* | ||
1290 | * The forget flag here is critical because if | ||
1291 | * we are journaling (and not doing data | ||
1292 | * journaling), we have to make sure a revoke | ||
1293 | * record is written to prevent the journal | ||
1294 | * replay from overwriting the (former) | ||
1295 | * indirect block if it gets reallocated as a | ||
1296 | * data block. This must happen in the same | ||
1297 | * transaction where the data blocks are | ||
1298 | * actually freed. | ||
1299 | */ | ||
1300 | ext4_free_blocks(handle, inode, NULL, nr, 1, | ||
1301 | EXT4_FREE_BLOCKS_METADATA| | ||
1302 | EXT4_FREE_BLOCKS_FORGET); | ||
1303 | |||
1304 | if (parent_bh) { | ||
1305 | /* | ||
1306 | * The block which we have just freed is | ||
1307 | * pointed to by an indirect block: journal it | ||
1308 | */ | ||
1309 | BUFFER_TRACE(parent_bh, "get_write_access"); | ||
1310 | if (!ext4_journal_get_write_access(handle, | ||
1311 | parent_bh)){ | ||
1312 | *p = 0; | ||
1313 | BUFFER_TRACE(parent_bh, | ||
1314 | "call ext4_handle_dirty_metadata"); | ||
1315 | ext4_handle_dirty_metadata(handle, | ||
1316 | inode, | ||
1317 | parent_bh); | ||
1318 | } | ||
1319 | } | ||
1320 | } | ||
1321 | } else { | ||
1322 | /* We have reached the bottom of the tree. */ | ||
1323 | BUFFER_TRACE(parent_bh, "free data blocks"); | ||
1324 | ext4_free_data(handle, inode, parent_bh, first, last); | ||
1325 | } | ||
1326 | } | ||
1327 | |||
1328 | void ext4_ind_truncate(struct inode *inode) | ||
1329 | { | ||
1330 | handle_t *handle; | ||
1331 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1332 | __le32 *i_data = ei->i_data; | ||
1333 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
1334 | struct address_space *mapping = inode->i_mapping; | ||
1335 | ext4_lblk_t offsets[4]; | ||
1336 | Indirect chain[4]; | ||
1337 | Indirect *partial; | ||
1338 | __le32 nr = 0; | ||
1339 | int n = 0; | ||
1340 | ext4_lblk_t last_block, max_block; | ||
1341 | unsigned blocksize = inode->i_sb->s_blocksize; | ||
1342 | |||
1343 | handle = start_transaction(inode); | ||
1344 | if (IS_ERR(handle)) | ||
1345 | return; /* AKPM: return what? */ | ||
1346 | |||
1347 | last_block = (inode->i_size + blocksize-1) | ||
1348 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
1349 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
1350 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
1351 | |||
1352 | if (inode->i_size & (blocksize - 1)) | ||
1353 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | ||
1354 | goto out_stop; | ||
1355 | |||
1356 | if (last_block != max_block) { | ||
1357 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | ||
1358 | if (n == 0) | ||
1359 | goto out_stop; /* error */ | ||
1360 | } | ||
1361 | |||
1362 | /* | ||
1363 | * OK. This truncate is going to happen. We add the inode to the | ||
1364 | * orphan list, so that if this truncate spans multiple transactions, | ||
1365 | * and we crash, we will resume the truncate when the filesystem | ||
1366 | * recovers. It also marks the inode dirty, to catch the new size. | ||
1367 | * | ||
1368 | * Implication: the file must always be in a sane, consistent | ||
1369 | * truncatable state while each transaction commits. | ||
1370 | */ | ||
1371 | if (ext4_orphan_add(handle, inode)) | ||
1372 | goto out_stop; | ||
1373 | |||
1374 | /* | ||
1375 | * From here we block out all ext4_get_block() callers who want to | ||
1376 | * modify the block allocation tree. | ||
1377 | */ | ||
1378 | down_write(&ei->i_data_sem); | ||
1379 | |||
1380 | ext4_discard_preallocations(inode); | ||
1381 | |||
1382 | /* | ||
1383 | * The orphan list entry will now protect us from any crash which | ||
1384 | * occurs before the truncate completes, so it is now safe to propagate | ||
1385 | * the new, shorter inode size (held for now in i_size) into the | ||
1386 | * on-disk inode. We do this via i_disksize, which is the value which | ||
1387 | * ext4 *really* writes onto the disk inode. | ||
1388 | */ | ||
1389 | ei->i_disksize = inode->i_size; | ||
1390 | |||
1391 | if (last_block == max_block) { | ||
1392 | /* | ||
1393 | * It is unnecessary to free any data blocks if last_block is | ||
1394 | * equal to the indirect block limit. | ||
1395 | */ | ||
1396 | goto out_unlock; | ||
1397 | } else if (n == 1) { /* direct blocks */ | ||
1398 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | ||
1399 | i_data + EXT4_NDIR_BLOCKS); | ||
1400 | goto do_indirects; | ||
1401 | } | ||
1402 | |||
1403 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | ||
1404 | /* Kill the top of shared branch (not detached) */ | ||
1405 | if (nr) { | ||
1406 | if (partial == chain) { | ||
1407 | /* Shared branch grows from the inode */ | ||
1408 | ext4_free_branches(handle, inode, NULL, | ||
1409 | &nr, &nr+1, (chain+n-1) - partial); | ||
1410 | *partial->p = 0; | ||
1411 | /* | ||
1412 | * We mark the inode dirty prior to restart, | ||
1413 | * and prior to stop. No need for it here. | ||
1414 | */ | ||
1415 | } else { | ||
1416 | /* Shared branch grows from an indirect block */ | ||
1417 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
1418 | ext4_free_branches(handle, inode, partial->bh, | ||
1419 | partial->p, | ||
1420 | partial->p+1, (chain+n-1) - partial); | ||
1421 | } | ||
1422 | } | ||
1423 | /* Clear the ends of indirect blocks on the shared branch */ | ||
1424 | while (partial > chain) { | ||
1425 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | ||
1426 | (__le32*)partial->bh->b_data+addr_per_block, | ||
1427 | (chain+n-1) - partial); | ||
1428 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1429 | brelse(partial->bh); | ||
1430 | partial--; | ||
1431 | } | ||
1432 | do_indirects: | ||
1433 | /* Kill the remaining (whole) subtrees */ | ||
1434 | switch (offsets[0]) { | ||
1435 | default: | ||
1436 | nr = i_data[EXT4_IND_BLOCK]; | ||
1437 | if (nr) { | ||
1438 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | ||
1439 | i_data[EXT4_IND_BLOCK] = 0; | ||
1440 | } | ||
1441 | case EXT4_IND_BLOCK: | ||
1442 | nr = i_data[EXT4_DIND_BLOCK]; | ||
1443 | if (nr) { | ||
1444 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | ||
1445 | i_data[EXT4_DIND_BLOCK] = 0; | ||
1446 | } | ||
1447 | case EXT4_DIND_BLOCK: | ||
1448 | nr = i_data[EXT4_TIND_BLOCK]; | ||
1449 | if (nr) { | ||
1450 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | ||
1451 | i_data[EXT4_TIND_BLOCK] = 0; | ||
1452 | } | ||
1453 | case EXT4_TIND_BLOCK: | ||
1454 | ; | ||
1455 | } | ||
1456 | |||
1457 | out_unlock: | ||
1458 | up_write(&ei->i_data_sem); | ||
1459 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
1460 | ext4_mark_inode_dirty(handle, inode); | ||
1461 | |||
1462 | /* | ||
1463 | * In a multi-transaction truncate, we only make the final transaction | ||
1464 | * synchronous | ||
1465 | */ | ||
1466 | if (IS_SYNC(inode)) | ||
1467 | ext4_handle_sync(handle); | ||
1468 | out_stop: | ||
1469 | /* | ||
1470 | * If this was a simple ftruncate(), and the file will remain alive | ||
1471 | * then we need to clear up the orphan record which we created above. | ||
1472 | * However, if this was a real unlink then we were called by | ||
1473 | * ext4_delete_inode(), and we allow that function to clean up the | ||
1474 | * orphan info for us. | ||
1475 | */ | ||
1476 | if (inode->i_nlink) | ||
1477 | ext4_orphan_del(handle, inode); | ||
1478 | |||
1479 | ext4_journal_stop(handle); | ||
1480 | trace_ext4_truncate_exit(inode); | ||
1481 | } | ||
1482 | |||
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3e5191f9f398..d47264cafee0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -12,10 +12,6 @@ | |||
12 | * | 12 | * |
13 | * Copyright (C) 1991, 1992 Linus Torvalds | 13 | * Copyright (C) 1991, 1992 Linus Torvalds |
14 | * | 14 | * |
15 | * Goal-directed block allocation by Stephen Tweedie | ||
16 | * (sct@redhat.com), 1993, 1998 | ||
17 | * Big-endian to little-endian byte-swapping/bitmaps by | ||
18 | * David S. Miller (davem@caip.rutgers.edu), 1995 | ||
19 | * 64-bit file support on 64-bit platforms by Jakub Jelinek | 15 | * 64-bit file support on 64-bit platforms by Jakub Jelinek |
20 | * (jj@sunsite.ms.mff.cuni.cz) | 16 | * (jj@sunsite.ms.mff.cuni.cz) |
21 | * | 17 | * |
@@ -47,6 +43,7 @@ | |||
47 | #include "xattr.h" | 43 | #include "xattr.h" |
48 | #include "acl.h" | 44 | #include "acl.h" |
49 | #include "ext4_extents.h" | 45 | #include "ext4_extents.h" |
46 | #include "truncate.h" | ||
50 | 47 | ||
51 | #include <trace/events/ext4.h> | 48 | #include <trace/events/ext4.h> |
52 | 49 | ||
@@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) | |||
89 | } | 86 | } |
90 | 87 | ||
91 | /* | 88 | /* |
92 | * Work out how many blocks we need to proceed with the next chunk of a | ||
93 | * truncate transaction. | ||
94 | */ | ||
95 | static unsigned long blocks_for_truncate(struct inode *inode) | ||
96 | { | ||
97 | ext4_lblk_t needed; | ||
98 | |||
99 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | ||
100 | |||
101 | /* Give ourselves just enough room to cope with inodes in which | ||
102 | * i_blocks is corrupt: we've seen disk corruptions in the past | ||
103 | * which resulted in random data in an inode which looked enough | ||
104 | * like a regular file for ext4 to try to delete it. Things | ||
105 | * will go a bit crazy if that happens, but at least we should | ||
106 | * try not to panic the whole kernel. */ | ||
107 | if (needed < 2) | ||
108 | needed = 2; | ||
109 | |||
110 | /* But we need to bound the transaction so we don't overflow the | ||
111 | * journal. */ | ||
112 | if (needed > EXT4_MAX_TRANS_DATA) | ||
113 | needed = EXT4_MAX_TRANS_DATA; | ||
114 | |||
115 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; | ||
116 | } | ||
117 | |||
118 | /* | ||
119 | * Truncate transactions can be complex and absolutely huge. So we need to | ||
120 | * be able to restart the transaction at a convenient checkpoint to make | ||
121 | * sure we don't overflow the journal. | ||
122 | * | ||
123 | * start_transaction gets us a new handle for a truncate transaction, | ||
124 | * and extend_transaction tries to extend the existing one a bit. If | ||
125 | * extend fails, we need to propagate the failure up and restart the | ||
126 | * transaction in the top-level truncate loop. --sct | ||
127 | */ | ||
128 | static handle_t *start_transaction(struct inode *inode) | ||
129 | { | ||
130 | handle_t *result; | ||
131 | |||
132 | result = ext4_journal_start(inode, blocks_for_truncate(inode)); | ||
133 | if (!IS_ERR(result)) | ||
134 | return result; | ||
135 | |||
136 | ext4_std_error(inode->i_sb, PTR_ERR(result)); | ||
137 | return result; | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Try to extend this transaction for the purposes of truncation. | ||
142 | * | ||
143 | * Returns 0 if we managed to create more room. If we can't create more | ||
144 | * room, and the transaction must be restarted we return 1. | ||
145 | */ | ||
146 | static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | ||
147 | { | ||
148 | if (!ext4_handle_valid(handle)) | ||
149 | return 0; | ||
150 | if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) | ||
151 | return 0; | ||
152 | if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) | ||
153 | return 0; | ||
154 | return 1; | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Restart the transaction associated with *handle. This does a commit, | 89 | * Restart the transaction associated with *handle. This does a commit, |
159 | * so before we call here everything must be consistently dirtied against | 90 | * so before we call here everything must be consistently dirtied against |
160 | * this transaction. | 91 | * this transaction. |
@@ -190,6 +121,33 @@ void ext4_evict_inode(struct inode *inode) | |||
190 | 121 | ||
191 | trace_ext4_evict_inode(inode); | 122 | trace_ext4_evict_inode(inode); |
192 | if (inode->i_nlink) { | 123 | if (inode->i_nlink) { |
124 | /* | ||
125 | * When journalling data dirty buffers are tracked only in the | ||
126 | * journal. So although mm thinks everything is clean and | ||
127 | * ready for reaping the inode might still have some pages to | ||
128 | * write in the running transaction or waiting to be | ||
129 | * checkpointed. Thus calling jbd2_journal_invalidatepage() | ||
130 | * (via truncate_inode_pages()) to discard these buffers can | ||
131 | * cause data loss. Also even if we did not discard these | ||
132 | * buffers, we would have no way to find them after the inode | ||
133 | * is reaped and thus user could see stale data if he tries to | ||
134 | * read them before the transaction is checkpointed. So be | ||
135 | * careful and force everything to disk here... We use | ||
136 | * ei->i_datasync_tid to store the newest transaction | ||
137 | * containing inode's data. | ||
138 | * | ||
139 | * Note that directories do not have this problem because they | ||
140 | * don't use page cache. | ||
141 | */ | ||
142 | if (ext4_should_journal_data(inode) && | ||
143 | (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { | ||
144 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | ||
145 | tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; | ||
146 | |||
147 | jbd2_log_start_commit(journal, commit_tid); | ||
148 | jbd2_log_wait_commit(journal, commit_tid); | ||
149 | filemap_write_and_wait(&inode->i_data); | ||
150 | } | ||
193 | truncate_inode_pages(&inode->i_data, 0); | 151 | truncate_inode_pages(&inode->i_data, 0); |
194 | goto no_delete; | 152 | goto no_delete; |
195 | } | 153 | } |
@@ -204,7 +162,7 @@ void ext4_evict_inode(struct inode *inode) | |||
204 | if (is_bad_inode(inode)) | 162 | if (is_bad_inode(inode)) |
205 | goto no_delete; | 163 | goto no_delete; |
206 | 164 | ||
207 | handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); | 165 | handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3); |
208 | if (IS_ERR(handle)) { | 166 | if (IS_ERR(handle)) { |
209 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); | 167 | ext4_std_error(inode->i_sb, PTR_ERR(handle)); |
210 | /* | 168 | /* |
@@ -277,793 +235,6 @@ no_delete: | |||
277 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ | 235 | ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ |
278 | } | 236 | } |
279 | 237 | ||
280 | typedef struct { | ||
281 | __le32 *p; | ||
282 | __le32 key; | ||
283 | struct buffer_head *bh; | ||
284 | } Indirect; | ||
285 | |||
286 | static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v) | ||
287 | { | ||
288 | p->key = *(p->p = v); | ||
289 | p->bh = bh; | ||
290 | } | ||
291 | |||
292 | /** | ||
293 | * ext4_block_to_path - parse the block number into array of offsets | ||
294 | * @inode: inode in question (we are only interested in its superblock) | ||
295 | * @i_block: block number to be parsed | ||
296 | * @offsets: array to store the offsets in | ||
297 | * @boundary: set this non-zero if the referred-to block is likely to be | ||
298 | * followed (on disk) by an indirect block. | ||
299 | * | ||
300 | * To store the locations of a file's data, ext4 uses a data structure common | ||
301 | * for UNIX filesystems - tree of pointers anchored in the inode, with | ||
302 | * data blocks at leaves and indirect blocks in intermediate nodes. | ||
303 | * This function translates the block number into path in that tree - | ||
304 | * return value is the path length and @offsets[n] is the offset of | ||
305 | * pointer to (n+1)th node in the nth one. If @block is out of range | ||
306 | * (negative or too large), a warning is printed and zero is returned. | ||
307 | * | ||
308 | * Note: function doesn't find node addresses, so no IO is needed. All | ||
309 | * we need to know is the capacity of indirect blocks (taken from the | ||
310 | * inode->i_sb). | ||
311 | */ | ||
312 | |||
313 | /* | ||
314 | * Portability note: the last comparison (check that we fit into triple | ||
315 | * indirect block) is spelled differently, because otherwise on an | ||
316 | * architecture with 32-bit longs and 8Kb pages we might get into trouble | ||
317 | * if our filesystem had 8Kb blocks. We might use long long, but that would | ||
318 | * kill us on x86. Oh, well, at least the sign propagation does not matter - | ||
319 | * i_block would have to be negative in the very beginning, so we would not | ||
320 | * get there at all. | ||
321 | */ | ||
322 | |||
323 | static int ext4_block_to_path(struct inode *inode, | ||
324 | ext4_lblk_t i_block, | ||
325 | ext4_lblk_t offsets[4], int *boundary) | ||
326 | { | ||
327 | int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
328 | int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb); | ||
329 | const long direct_blocks = EXT4_NDIR_BLOCKS, | ||
330 | indirect_blocks = ptrs, | ||
331 | double_blocks = (1 << (ptrs_bits * 2)); | ||
332 | int n = 0; | ||
333 | int final = 0; | ||
334 | |||
335 | if (i_block < direct_blocks) { | ||
336 | offsets[n++] = i_block; | ||
337 | final = direct_blocks; | ||
338 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | ||
339 | offsets[n++] = EXT4_IND_BLOCK; | ||
340 | offsets[n++] = i_block; | ||
341 | final = ptrs; | ||
342 | } else if ((i_block -= indirect_blocks) < double_blocks) { | ||
343 | offsets[n++] = EXT4_DIND_BLOCK; | ||
344 | offsets[n++] = i_block >> ptrs_bits; | ||
345 | offsets[n++] = i_block & (ptrs - 1); | ||
346 | final = ptrs; | ||
347 | } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { | ||
348 | offsets[n++] = EXT4_TIND_BLOCK; | ||
349 | offsets[n++] = i_block >> (ptrs_bits * 2); | ||
350 | offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); | ||
351 | offsets[n++] = i_block & (ptrs - 1); | ||
352 | final = ptrs; | ||
353 | } else { | ||
354 | ext4_warning(inode->i_sb, "block %lu > max in inode %lu", | ||
355 | i_block + direct_blocks + | ||
356 | indirect_blocks + double_blocks, inode->i_ino); | ||
357 | } | ||
358 | if (boundary) | ||
359 | *boundary = final - 1 - (i_block & (ptrs - 1)); | ||
360 | return n; | ||
361 | } | ||
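The mapping is easiest to see with concrete numbers. Below is a standalone rendering of the same arithmetic with the constants fixed for 4KiB blocks (12 direct slots, 1024 pointers per indirect block; the i_data slot numbers 12/13/14 correspond to EXT4_IND_BLOCK, EXT4_DIND_BLOCK and EXT4_TIND_BLOCK; boundary handling omitted):

    #include <stdio.h>

    #define NDIR      12            /* EXT4_NDIR_BLOCKS */
    #define PTRS      1024          /* EXT4_ADDR_PER_BLOCK at 4KiB */
    #define PTRS_BITS 10

    static int block_to_path(long i_block, long offsets[4])
    {
            int n = 0;

            if (i_block < NDIR) {
                    offsets[n++] = i_block;
            } else if ((i_block -= NDIR) < PTRS) {
                    offsets[n++] = 12;                    /* single indirect */
                    offsets[n++] = i_block;
            } else if ((i_block -= PTRS) < (1L << (PTRS_BITS * 2))) {
                    offsets[n++] = 13;                    /* double indirect */
                    offsets[n++] = i_block >> PTRS_BITS;
                    offsets[n++] = i_block & (PTRS - 1);
            } else if (((i_block -= 1L << (PTRS_BITS * 2)) >> (PTRS_BITS * 2)) < PTRS) {
                    offsets[n++] = 14;                    /* triple indirect */
                    offsets[n++] = i_block >> (PTRS_BITS * 2);
                    offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
                    offsets[n++] = i_block & (PTRS - 1);
            }
            return n;               /* path depth; 0 means out of range */
    }

    int main(void)
    {
            long samples[] = { 5, 12, 1035, 1036, 1049612 };
            for (int i = 0; i < 5; i++) {
                    long off[4];
                    int depth = block_to_path(samples[i], off);
                    printf("block %7ld -> depth %d:", samples[i], depth);
                    for (int j = 0; j < depth; j++)
                            printf(" %ld", off[j]);
                    printf("\n");
            }
            return 0;
    }

Block 5 stays direct; 12 is the first single-indirect block; 1036 is the first to need the double indirect; 1049612 is the first to need the triple.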
362 | |||
363 | static int __ext4_check_blockref(const char *function, unsigned int line, | ||
364 | struct inode *inode, | ||
365 | __le32 *p, unsigned int max) | ||
366 | { | ||
367 | struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; | ||
368 | __le32 *bref = p; | ||
369 | unsigned int blk; | ||
370 | |||
371 | while (bref < p+max) { | ||
372 | blk = le32_to_cpu(*bref++); | ||
373 | if (blk && | ||
374 | unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
375 | blk, 1))) { | ||
376 | es->s_last_error_block = cpu_to_le64(blk); | ||
377 | ext4_error_inode(inode, function, line, blk, | ||
378 | "invalid block"); | ||
379 | return -EIO; | ||
380 | } | ||
381 | } | ||
382 | return 0; | ||
383 | } | ||
384 | |||
385 | |||
386 | #define ext4_check_indirect_blockref(inode, bh) \ | ||
387 | __ext4_check_blockref(__func__, __LINE__, inode, \ | ||
388 | (__le32 *)(bh)->b_data, \ | ||
389 | EXT4_ADDR_PER_BLOCK((inode)->i_sb)) | ||
390 | |||
391 | #define ext4_check_inode_blockref(inode) \ | ||
392 | __ext4_check_blockref(__func__, __LINE__, inode, \ | ||
393 | EXT4_I(inode)->i_data, \ | ||
394 | EXT4_NDIR_BLOCKS) | ||
395 | |||
396 | /** | ||
397 | * ext4_get_branch - read the chain of indirect blocks leading to data | ||
398 | * @inode: inode in question | ||
399 | * @depth: depth of the chain (1 - direct pointer, etc.) | ||
400 | * @offsets: offsets of pointers in inode/indirect blocks | ||
401 | * @chain: place to store the result | ||
402 | * @err: here we store the error value | ||
403 | * | ||
404 | * Function fills the array of triples <key, p, bh> and returns %NULL | ||
405 | * if everything went OK or the pointer to the last filled triple | ||
406 | * (incomplete one) otherwise. Upon the return chain[i].key contains | ||
407 | * the number of (i+1)-th block in the chain (as it is stored in memory, | ||
408 | * i.e. little-endian 32-bit), chain[i].p contains the address of that | ||
409 | * number (it points into struct inode for i==0 and into the bh->b_data | ||
410 | * for i>0) and chain[i].bh points to the buffer_head of i-th indirect | ||
411 | * block for i>0 and NULL for i==0. In other words, it holds the block | ||
412 | * numbers of the chain, addresses they were taken from (and where we can | ||
413 | * verify that chain did not change) and buffer_heads hosting these | ||
414 | * numbers. | ||
415 | * | ||
416 | * Function stops when it stumbles upon zero pointer (absent block) | ||
417 | * (pointer to last triple returned, *@err == 0) | ||
418 | * or when it gets an IO error reading an indirect block | ||
419 | * (ditto, *@err == -EIO) | ||
420 | * or when it reads all @depth-1 indirect blocks successfully and finds | ||
421 | * the whole chain, all way to the data (returns %NULL, *err == 0). | ||
422 | * | ||
423 | * Needs to be called with | ||
424 | * down_read(&EXT4_I(inode)->i_data_sem) | ||
425 | */ | ||
426 | static Indirect *ext4_get_branch(struct inode *inode, int depth, | ||
427 | ext4_lblk_t *offsets, | ||
428 | Indirect chain[4], int *err) | ||
429 | { | ||
430 | struct super_block *sb = inode->i_sb; | ||
431 | Indirect *p = chain; | ||
432 | struct buffer_head *bh; | ||
433 | |||
434 | *err = 0; | ||
435 | /* i_data is not going away, no lock needed */ | ||
436 | add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); | ||
437 | if (!p->key) | ||
438 | goto no_block; | ||
439 | while (--depth) { | ||
440 | bh = sb_getblk(sb, le32_to_cpu(p->key)); | ||
441 | if (unlikely(!bh)) | ||
442 | goto failure; | ||
443 | |||
444 | if (!bh_uptodate_or_lock(bh)) { | ||
445 | if (bh_submit_read(bh) < 0) { | ||
446 | put_bh(bh); | ||
447 | goto failure; | ||
448 | } | ||
449 | /* validate block references */ | ||
450 | if (ext4_check_indirect_blockref(inode, bh)) { | ||
451 | put_bh(bh); | ||
452 | goto failure; | ||
453 | } | ||
454 | } | ||
455 | |||
456 | add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); | ||
457 | /* Reader: end */ | ||
458 | if (!p->key) | ||
459 | goto no_block; | ||
460 | } | ||
461 | return NULL; | ||
462 | |||
463 | failure: | ||
464 | *err = -EIO; | ||
465 | no_block: | ||
466 | return p; | ||
467 | } | ||
468 | |||
469 | /** | ||
470 | * ext4_find_near - find a place for allocation with sufficient locality | ||
471 | * @inode: owner | ||
472 | * @ind: descriptor of indirect block. | ||
473 | * | ||
474 | * This function returns the preferred place for block allocation. | ||
475 | * It is used when heuristic for sequential allocation fails. | ||
476 | * Rules are: | ||
477 | * + if there is a block to the left of our position - allocate near it. | ||
478 | * + if pointer will live in indirect block - allocate near that block. | ||
479 | * + if pointer will live in inode - allocate in the same | ||
480 | * cylinder group. | ||
481 | * | ||
482 | * In the latter case we colour the starting block by the caller's PID to | ||
483 | * prevent it from clashing with concurrent allocations for a different inode | ||
484 | * in the same block group. The PID is used here so that functionally related | ||
485 | * files will be close-by on-disk. | ||
486 | * | ||
487 | * Caller must make sure that @ind is valid and will stay that way. | ||
488 | */ | ||
489 | static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | ||
490 | { | ||
491 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
492 | __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; | ||
493 | __le32 *p; | ||
494 | ext4_fsblk_t bg_start; | ||
495 | ext4_fsblk_t last_block; | ||
496 | ext4_grpblk_t colour; | ||
497 | ext4_group_t block_group; | ||
498 | int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); | ||
499 | |||
500 | /* Try to find previous block */ | ||
501 | for (p = ind->p - 1; p >= start; p--) { | ||
502 | if (*p) | ||
503 | return le32_to_cpu(*p); | ||
504 | } | ||
505 | |||
506 | /* No such thing, so let's try the location of the indirect block */ | ||
507 | if (ind->bh) | ||
508 | return ind->bh->b_blocknr; | ||
509 | |||
510 | /* | ||
511 | * It is going to be referred to from the inode itself? OK, just put it | ||
512 | * into the same cylinder group then. | ||
513 | */ | ||
514 | block_group = ei->i_block_group; | ||
515 | if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { | ||
516 | block_group &= ~(flex_size-1); | ||
517 | if (S_ISREG(inode->i_mode)) | ||
518 | block_group++; | ||
519 | } | ||
520 | bg_start = ext4_group_first_block_no(inode->i_sb, block_group); | ||
521 | last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; | ||
522 | |||
523 | /* | ||
524 | * If we are doing delayed allocation, we don't need to take | ||
525 | * colour into account. | ||
526 | */ | ||
527 | if (test_opt(inode->i_sb, DELALLOC)) | ||
528 | return bg_start; | ||
529 | |||
530 | if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) | ||
531 | colour = (current->pid % 16) * | ||
532 | (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); | ||
533 | else | ||
534 | colour = (current->pid % 16) * ((last_block - bg_start) / 16); | ||
535 | return bg_start + colour; | ||
536 | } | ||
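The colour step just carves each group into sixteen PID-keyed slices. A quick standalone check of that arithmetic, assuming the common 32768-blocks-per-group geometry and a made-up group start:

    #include <stdio.h>

    #define BLOCKS_PER_GROUP 32768          /* typical with 4KiB blocks */

    int main(void)
    {
            unsigned long bg_start = 65536; /* hypothetical first block of the group */

            for (int pid = 100; pid < 104; pid++) {
                    unsigned long colour = (pid % 16) * (BLOCKS_PER_GROUP / 16);
                    printf("pid %d -> goal %lu\n", pid, bg_start + colour);
            }
            return 0;
    }

Each of the sixteen classes lands 2048 blocks apart, which is what keeps concurrent writers from piling onto the same starting block.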
537 | |||
538 | /** | ||
539 | * ext4_find_goal - find a preferred place for allocation. | ||
540 | * @inode: owner | ||
541 | * @block: block we want | ||
542 | * @partial: pointer to the last triple within a chain | ||
543 | * | ||
544 | * Normally this function finds the preferred place for block allocation | ||
545 | * and returns it. | ||
546 | * Because this is only used for non-extent files, we limit the block nr | ||
547 | * to 32 bits. | ||
548 | */ | ||
549 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | ||
550 | Indirect *partial) | ||
551 | { | ||
552 | ext4_fsblk_t goal; | ||
553 | |||
554 | /* | ||
555 | * XXX need to get goal block from mballoc's data structures | ||
556 | */ | ||
557 | |||
558 | goal = ext4_find_near(inode, partial); | ||
559 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
560 | return goal; | ||
561 | } | ||
562 | |||
563 | /** | ||
564 | * ext4_blks_to_allocate - Look up the block map and count the number | ||
565 | * of direct blocks that need to be allocated for the given branch. | ||
566 | * | ||
567 | * @branch: chain of indirect blocks | ||
568 | * @k: number of blocks needed for indirect blocks | ||
569 | * @blks: number of data blocks to be mapped. | ||
570 | * @blocks_to_boundary: the offset in the indirect block | ||
571 | * | ||
572 | * return the total number of blocks to be allocated, including the | ||
573 | * direct and indirect blocks. | ||
574 | */ | ||
575 | static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, | ||
576 | int blocks_to_boundary) | ||
577 | { | ||
578 | unsigned int count = 0; | ||
579 | |||
580 | /* | ||
581 | * Simple case: the [t,d]indirect block(s) have not been allocated yet, | ||
582 | * so clearly the blocks on that path have not been allocated either. | ||
583 | */ | ||
584 | if (k > 0) { | ||
585 | /* right now we don't handle cross boundary allocation */ | ||
586 | if (blks < blocks_to_boundary + 1) | ||
587 | count += blks; | ||
588 | else | ||
589 | count += blocks_to_boundary + 1; | ||
590 | return count; | ||
591 | } | ||
592 | |||
593 | count++; | ||
594 | while (count < blks && count <= blocks_to_boundary && | ||
595 | le32_to_cpu(*(branch[0].p + count)) == 0) { | ||
596 | count++; | ||
597 | } | ||
598 | return count; | ||
599 | } | ||
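For the common k == 0 case, the counting loop can be tried standalone with a hypothetical pointer array standing in for the indirect block's contents; counting stops at the request length, the boundary, or the first already-mapped slot, whichever comes first:

    #include <stdio.h>

    static unsigned count_blocks(const unsigned *slots, unsigned blks,
                                 unsigned blocks_to_boundary)
    {
            unsigned count = 1;     /* the first direct block is always needed */

            while (count < blks && count <= blocks_to_boundary &&
                   slots[count] == 0)
                    count++;
            return count;
    }

    int main(void)
    {
            /* slot 3 is already mapped, so a request for 8 blocks stops there */
            unsigned slots[8] = { 0, 0, 0, 977, 0, 0, 0, 0 };

            printf("%u block(s) to allocate\n", count_blocks(slots, 8, 7));
            return 0;
    }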
600 | |||
601 | /** | ||
602 | * ext4_alloc_blocks: multiple allocate blocks needed for a branch | ||
603 | * @handle: handle for this transaction | ||
604 | * @inode: inode which needs allocated blocks | ||
605 | * @iblock: the logical block to start allocated at | ||
606 | * @goal: preferred physical block of allocation | ||
607 | * @indirect_blks: the number of blocks needed to allocate for indirect | ||
608 | * blocks | ||
609 | * @blks: number of desired blocks | ||
610 | * @new_blocks: on return it will store the new block numbers for | ||
611 | * the indirect blocks(if needed) and the first direct block, | ||
612 | * @err: on return it will store the error code | ||
613 | * | ||
614 | * This function will return the number of blocks allocated as | ||
615 | * requested by the passed-in parameters. | ||
616 | */ | ||
617 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | ||
618 | ext4_lblk_t iblock, ext4_fsblk_t goal, | ||
619 | int indirect_blks, int blks, | ||
620 | ext4_fsblk_t new_blocks[4], int *err) | ||
621 | { | ||
622 | struct ext4_allocation_request ar; | ||
623 | int target, i; | ||
624 | unsigned long count = 0, blk_allocated = 0; | ||
625 | int index = 0; | ||
626 | ext4_fsblk_t current_block = 0; | ||
627 | int ret = 0; | ||
628 | |||
629 | /* | ||
630 | * Here we try to allocate the requested multiple blocks at once, | ||
631 | * on a best-effort basis. | ||
632 | * To build a branch, we should allocate blocks for | ||
633 | * the indirect blocks (if not allocated yet) and at least | ||
634 | * the first direct block of this branch. That's the | ||
635 | * minimum number of blocks that need to be allocated (required). | ||
636 | */ | ||
637 | /* first we try to allocate the indirect blocks */ | ||
638 | target = indirect_blks; | ||
639 | while (target > 0) { | ||
640 | count = target; | ||
641 | /* allocating blocks for indirect blocks and direct blocks */ | ||
642 | current_block = ext4_new_meta_blocks(handle, inode, goal, | ||
643 | 0, &count, err); | ||
644 | if (*err) | ||
645 | goto failed_out; | ||
646 | |||
647 | if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
648 | EXT4_ERROR_INODE(inode, | ||
649 | "current_block %llu + count %lu > %d!", | ||
650 | current_block, count, | ||
651 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
652 | *err = -EIO; | ||
653 | goto failed_out; | ||
654 | } | ||
655 | |||
656 | target -= count; | ||
657 | /* allocate blocks for indirect blocks */ | ||
658 | while (index < indirect_blks && count) { | ||
659 | new_blocks[index++] = current_block++; | ||
660 | count--; | ||
661 | } | ||
662 | if (count > 0) { | ||
663 | /* | ||
664 | * save the new block number | ||
665 | * for the first direct block | ||
666 | */ | ||
667 | new_blocks[index] = current_block; | ||
668 | printk(KERN_INFO "%s returned more blocks than " | ||
669 | "requested\n", __func__); | ||
670 | WARN_ON(1); | ||
671 | break; | ||
672 | } | ||
673 | } | ||
674 | |||
675 | target = blks - count; | ||
676 | blk_allocated = count; | ||
677 | if (!target) | ||
678 | goto allocated; | ||
679 | /* Now allocate data blocks */ | ||
680 | memset(&ar, 0, sizeof(ar)); | ||
681 | ar.inode = inode; | ||
682 | ar.goal = goal; | ||
683 | ar.len = target; | ||
684 | ar.logical = iblock; | ||
685 | if (S_ISREG(inode->i_mode)) | ||
686 | /* enable in-core preallocation only for regular files */ | ||
687 | ar.flags = EXT4_MB_HINT_DATA; | ||
688 | |||
689 | current_block = ext4_mb_new_blocks(handle, &ar, err); | ||
690 | if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) { | ||
691 | EXT4_ERROR_INODE(inode, | ||
692 | "current_block %llu + ar.len %d > %d!", | ||
693 | current_block, ar.len, | ||
694 | EXT4_MAX_BLOCK_FILE_PHYS); | ||
695 | *err = -EIO; | ||
696 | goto failed_out; | ||
697 | } | ||
698 | |||
699 | if (*err && (target == blks)) { | ||
700 | /* | ||
701 | * if the allocation failed and we didn't allocate | ||
702 | * any blocks before | ||
703 | */ | ||
704 | goto failed_out; | ||
705 | } | ||
706 | if (!*err) { | ||
707 | if (target == blks) { | ||
708 | /* | ||
709 | * save the new block number | ||
710 | * for the first direct block | ||
711 | */ | ||
712 | new_blocks[index] = current_block; | ||
713 | } | ||
714 | blk_allocated += ar.len; | ||
715 | } | ||
716 | allocated: | ||
717 | /* total number of blocks allocated for direct blocks */ | ||
718 | ret = blk_allocated; | ||
719 | *err = 0; | ||
720 | return ret; | ||
721 | failed_out: | ||
722 | for (i = 0; i < index; i++) | ||
723 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
724 | return ret; | ||
725 | } | ||
726 | |||
727 | /** | ||
728 | * ext4_alloc_branch - allocate and set up a chain of blocks. | ||
729 | * @handle: handle for this transaction | ||
730 | * @inode: owner | ||
731 | * @indirect_blks: number of allocated indirect blocks | ||
732 | * @blks: number of allocated direct blocks | ||
733 | * @goal: preferred place for allocation | ||
734 | * @offsets: offsets (in the blocks) to store the pointers to next. | ||
735 | * @branch: place to store the chain in. | ||
736 | * | ||
737 | * This function allocates blocks, zeroes out all but the last one, | ||
738 | * links them into a chain and (if we are synchronous) writes them to disk. | ||
739 | * In other words, it prepares a branch that can be spliced onto the | ||
740 | * inode. It stores the information about that chain in the branch[], in | ||
741 | * the same format as ext4_get_branch() would do. We are calling it after | ||
742 | * we have read the existing part of the chain, and partial points to the last | ||
743 | * triple of that (one with zero ->key). Upon the exit we have the same | ||
744 | * picture as after the successful ext4_get_block(), except that in one | ||
745 | * place chain is disconnected - *branch->p is still zero (we did not | ||
746 | * set the last link), but branch->key contains the number that should | ||
747 | * be placed into *branch->p to fill that gap. | ||
748 | * | ||
749 | * If allocation fails we free all blocks we've allocated (and forget | ||
750 | * their buffer_heads) and return the error value from the failed | ||
751 | * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain | ||
752 | * as described above and return 0. | ||
753 | */ | ||
754 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | ||
755 | ext4_lblk_t iblock, int indirect_blks, | ||
756 | int *blks, ext4_fsblk_t goal, | ||
757 | ext4_lblk_t *offsets, Indirect *branch) | ||
758 | { | ||
759 | int blocksize = inode->i_sb->s_blocksize; | ||
760 | int i, n = 0; | ||
761 | int err = 0; | ||
762 | struct buffer_head *bh; | ||
763 | int num; | ||
764 | ext4_fsblk_t new_blocks[4]; | ||
765 | ext4_fsblk_t current_block; | ||
766 | |||
767 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | ||
768 | *blks, new_blocks, &err); | ||
769 | if (err) | ||
770 | return err; | ||
771 | |||
772 | branch[0].key = cpu_to_le32(new_blocks[0]); | ||
773 | /* | ||
774 | * metadata blocks and data blocks are allocated. | ||
775 | */ | ||
776 | for (n = 1; n <= indirect_blks; n++) { | ||
777 | /* | ||
778 | * Get buffer_head for parent block, zero it out | ||
779 | * and set the pointer to new one, then send | ||
780 | * parent to disk. | ||
781 | */ | ||
782 | bh = sb_getblk(inode->i_sb, new_blocks[n-1]); | ||
783 | if (unlikely(!bh)) { | ||
784 | err = -EIO; | ||
785 | goto failed; | ||
786 | } | ||
787 | |||
788 | branch[n].bh = bh; | ||
789 | lock_buffer(bh); | ||
790 | BUFFER_TRACE(bh, "call get_create_access"); | ||
791 | err = ext4_journal_get_create_access(handle, bh); | ||
792 | if (err) { | ||
793 | /* Don't brelse(bh) here; it's done in | ||
794 | * ext4_journal_forget() below */ | ||
795 | unlock_buffer(bh); | ||
796 | goto failed; | ||
797 | } | ||
798 | |||
799 | memset(bh->b_data, 0, blocksize); | ||
800 | branch[n].p = (__le32 *) bh->b_data + offsets[n]; | ||
801 | branch[n].key = cpu_to_le32(new_blocks[n]); | ||
802 | *branch[n].p = branch[n].key; | ||
803 | if (n == indirect_blks) { | ||
804 | current_block = new_blocks[n]; | ||
805 | /* | ||
806 | * End of chain, update the last new metablock of | ||
807 | * the chain to point to the new allocated | ||
808 | * data blocks numbers | ||
809 | */ | ||
810 | for (i = 1; i < num; i++) | ||
811 | *(branch[n].p + i) = cpu_to_le32(++current_block); | ||
812 | } | ||
813 | BUFFER_TRACE(bh, "marking uptodate"); | ||
814 | set_buffer_uptodate(bh); | ||
815 | unlock_buffer(bh); | ||
816 | |||
817 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
818 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
819 | if (err) | ||
820 | goto failed; | ||
821 | } | ||
822 | *blks = num; | ||
823 | return err; | ||
824 | failed: | ||
825 | /* Allocation failed, free what we already allocated */ | ||
826 | ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0); | ||
827 | for (i = 1; i <= n ; i++) { | ||
828 | /* | ||
829 | * branch[i].bh is newly allocated, so there is no | ||
830 | * need to revoke the block, which is why we don't | ||
831 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
832 | */ | ||
833 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, | ||
834 | EXT4_FREE_BLOCKS_FORGET); | ||
835 | } | ||
836 | for (i = n+1; i < indirect_blks; i++) | ||
837 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0); | ||
838 | |||
839 | ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0); | ||
840 | |||
841 | return err; | ||
842 | } | ||
843 | |||
844 | /** | ||
845 | * ext4_splice_branch - splice the allocated branch onto inode. | ||
846 | * @handle: handle for this transaction | ||
847 | * @inode: owner | ||
848 | * @block: (logical) number of block we are adding | ||
849 | * @chain: chain of indirect blocks (with a missing link - see | ||
850 | * ext4_alloc_branch) | ||
851 | * @where: location of missing link | ||
852 | * @num: number of indirect blocks we are adding | ||
853 | * @blks: number of direct blocks we are adding | ||
854 | * | ||
855 | * This function fills the missing link and does all housekeeping needed in | ||
856 | * inode (->i_blocks, etc.). In case of success we end up with the full | ||
857 | * chain to new block and return 0. | ||
858 | */ | ||
859 | static int ext4_splice_branch(handle_t *handle, struct inode *inode, | ||
860 | ext4_lblk_t block, Indirect *where, int num, | ||
861 | int blks) | ||
862 | { | ||
863 | int i; | ||
864 | int err = 0; | ||
865 | ext4_fsblk_t current_block; | ||
866 | |||
867 | /* | ||
868 | * If we're splicing into a [td]indirect block (as opposed to the | ||
869 | * inode) then we need to get write access to the [td]indirect block | ||
870 | * before the splice. | ||
871 | */ | ||
872 | if (where->bh) { | ||
873 | BUFFER_TRACE(where->bh, "get_write_access"); | ||
874 | err = ext4_journal_get_write_access(handle, where->bh); | ||
875 | if (err) | ||
876 | goto err_out; | ||
877 | } | ||
878 | /* That's it */ | ||
879 | |||
880 | *where->p = where->key; | ||
881 | |||
882 | /* | ||
883 | * Update the host buffer_head or inode to point to the just-allocated | ||
884 | * direct blocks | ||
885 | */ | ||
886 | if (num == 0 && blks > 1) { | ||
887 | current_block = le32_to_cpu(where->key) + 1; | ||
888 | for (i = 1; i < blks; i++) | ||
889 | *(where->p + i) = cpu_to_le32(current_block++); | ||
890 | } | ||
891 | |||
892 | /* We are done with atomic stuff, now do the rest of housekeeping */ | ||
893 | /* had we spliced it onto indirect block? */ | ||
894 | if (where->bh) { | ||
895 | /* | ||
896 | * If we spliced it onto an indirect block, we haven't | ||
897 | * altered the inode. Note however that if it is being spliced | ||
898 | * onto an indirect block at the very end of the file (the | ||
899 | * file is growing) then we *will* alter the inode to reflect | ||
900 | * the new i_size. But that is not done here - it is done in | ||
901 | * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. | ||
902 | */ | ||
903 | jbd_debug(5, "splicing indirect only\n"); | ||
904 | BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); | ||
905 | err = ext4_handle_dirty_metadata(handle, inode, where->bh); | ||
906 | if (err) | ||
907 | goto err_out; | ||
908 | } else { | ||
909 | /* | ||
910 | * OK, we spliced it into the inode itself on a direct block. | ||
911 | */ | ||
912 | ext4_mark_inode_dirty(handle, inode); | ||
913 | jbd_debug(5, "splicing direct\n"); | ||
914 | } | ||
915 | return err; | ||
916 | |||
917 | err_out: | ||
918 | for (i = 1; i <= num; i++) { | ||
919 | /* | ||
920 | * branch[i].bh is newly allocated, so there is no | ||
921 | * need to revoke the block, which is why we don't | ||
922 | * need to set EXT4_FREE_BLOCKS_METADATA. | ||
923 | */ | ||
924 | ext4_free_blocks(handle, inode, where[i].bh, 0, 1, | ||
925 | EXT4_FREE_BLOCKS_FORGET); | ||
926 | } | ||
927 | ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), | ||
928 | blks, 0); | ||
929 | |||
930 | return err; | ||
931 | } | ||
932 | |||
933 | /* | ||
934 | * The ext4_ind_map_blocks() function handles non-extent inodes | ||
935 | * (i.e., using the traditional indirect/double-indirect i_blocks | ||
936 | * scheme) for ext4_map_blocks(). | ||
937 | * | ||
938 | * Allocation strategy is simple: if we have to allocate something, we will | ||
939 | * have to go the whole way to leaf. So let's do it before attaching anything | ||
940 | * to tree, set linkage between the newborn blocks, write them if sync is | ||
941 | * required, recheck the path, free and repeat if check fails, otherwise | ||
942 | * set the last missing link (that will protect us from any truncate-generated | ||
943 | * removals - all blocks on the path are immune now) and possibly force the | ||
944 | * write on the parent block. | ||
945 | * That has a nice additional property: no special recovery from the failed | ||
946 | * allocations is needed - we simply release blocks and do not touch anything | ||
947 | * reachable from inode. | ||
948 | * | ||
949 | * `handle' can be NULL if create == 0. | ||
950 | * | ||
951 | * return > 0, # of blocks mapped or allocated. | ||
952 | * return = 0, if plain lookup failed. | ||
953 | * return < 0, error case. | ||
954 | * | ||
955 | * The ext4_ind_map_blocks() function should be called with | ||
956 | * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem | ||
957 | * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or | ||
958 | * down_read(&EXT4_I(inode)->i_data_sem) if not allocating filesystem | ||
959 | * blocks. | ||
960 | */ | ||
961 | static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, | ||
962 | struct ext4_map_blocks *map, | ||
963 | int flags) | ||
964 | { | ||
965 | int err = -EIO; | ||
966 | ext4_lblk_t offsets[4]; | ||
967 | Indirect chain[4]; | ||
968 | Indirect *partial; | ||
969 | ext4_fsblk_t goal; | ||
970 | int indirect_blks; | ||
971 | int blocks_to_boundary = 0; | ||
972 | int depth; | ||
973 | int count = 0; | ||
974 | ext4_fsblk_t first_block = 0; | ||
975 | |||
976 | trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); | ||
977 | J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); | ||
978 | J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); | ||
979 | depth = ext4_block_to_path(inode, map->m_lblk, offsets, | ||
980 | &blocks_to_boundary); | ||
981 | |||
982 | if (depth == 0) | ||
983 | goto out; | ||
984 | |||
985 | partial = ext4_get_branch(inode, depth, offsets, chain, &err); | ||
986 | |||
987 | /* Simplest case - block found, no allocation needed */ | ||
988 | if (!partial) { | ||
989 | first_block = le32_to_cpu(chain[depth - 1].key); | ||
990 | count++; | ||
991 | /* map more blocks */ | ||
992 | while (count < map->m_len && count <= blocks_to_boundary) { | ||
993 | ext4_fsblk_t blk; | ||
994 | |||
995 | blk = le32_to_cpu(*(chain[depth-1].p + count)); | ||
996 | |||
997 | if (blk == first_block + count) | ||
998 | count++; | ||
999 | else | ||
1000 | break; | ||
1001 | } | ||
1002 | goto got_it; | ||
1003 | } | ||
1004 | |||
1005 | /* Next simple case - plain lookup or failed read of indirect block */ | ||
1006 | if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) | ||
1007 | goto cleanup; | ||
1008 | |||
1009 | /* | ||
1010 | * Okay, we need to do block allocation. | ||
1011 | */ | ||
1012 | goal = ext4_find_goal(inode, map->m_lblk, partial); | ||
1013 | |||
1014 | /* the number of blocks needed to allocate for [d,t]indirect blocks */ | ||
1015 | indirect_blks = (chain + depth) - partial - 1; | ||
1016 | |||
1017 | /* | ||
1018 | * Next look up the indirect map to count the total number of | ||
1019 | * direct blocks to allocate for this branch. | ||
1020 | */ | ||
1021 | count = ext4_blks_to_allocate(partial, indirect_blks, | ||
1022 | map->m_len, blocks_to_boundary); | ||
1023 | /* | ||
1024 | * Block out ext4_truncate while we alter the tree | ||
1025 | */ | ||
1026 | err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, | ||
1027 | &count, goal, | ||
1028 | offsets + (partial - chain), partial); | ||
1029 | |||
1030 | /* | ||
1031 | * The ext4_splice_branch call will free and forget any buffers | ||
1032 | * on the new chain if there is a failure, but that risks using | ||
1033 | * up transaction credits, especially for bitmaps where the | ||
1034 | * credits cannot be returned. Can we handle this somehow? We | ||
1035 | * may need to return -EAGAIN upwards in the worst case. --sct | ||
1036 | */ | ||
1037 | if (!err) | ||
1038 | err = ext4_splice_branch(handle, inode, map->m_lblk, | ||
1039 | partial, indirect_blks, count); | ||
1040 | if (err) | ||
1041 | goto cleanup; | ||
1042 | |||
1043 | map->m_flags |= EXT4_MAP_NEW; | ||
1044 | |||
1045 | ext4_update_inode_fsync_trans(handle, inode, 1); | ||
1046 | got_it: | ||
1047 | map->m_flags |= EXT4_MAP_MAPPED; | ||
1048 | map->m_pblk = le32_to_cpu(chain[depth-1].key); | ||
1049 | map->m_len = count; | ||
1050 | if (count > blocks_to_boundary) | ||
1051 | map->m_flags |= EXT4_MAP_BOUNDARY; | ||
1052 | err = count; | ||
1053 | /* Clean up and exit */ | ||
1054 | partial = chain + depth - 1; /* the whole chain */ | ||
1055 | cleanup: | ||
1056 | while (partial > chain) { | ||
1057 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
1058 | brelse(partial->bh); | ||
1059 | partial--; | ||
1060 | } | ||
1061 | out: | ||
1062 | trace_ext4_ind_map_blocks_exit(inode, map->m_lblk, | ||
1063 | map->m_pblk, map->m_len, err); | ||
1064 | return err; | ||
1065 | } | ||
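The return convention spelled out above (> 0 blocks mapped, 0 for a hole on plain lookup, < 0 on error) is worth seeing from the caller's side. A toy stub with the same convention, not the real function:

    #include <stdio.h>

    /* Same three-way return convention; the mapping itself is invented. */
    static int map_blocks_stub(long lblk, long *pblk)
    {
            if (lblk < 0)
                    return -5;          /* an error, say -EIO */
            if (lblk > 100)
                    return 0;           /* hole: nothing mapped */
            *pblk = 5000 + lblk;        /* pretend a mapping exists */
            return 1;                   /* one block mapped */
    }

    int main(void)
    {
            long samples[] = { 7, 300, -1 };
            for (int i = 0; i < 3; i++) {
                    long pblk = 0;
                    int ret = map_blocks_stub(samples[i], &pblk);
                    if (ret > 0)
                            printf("lblk %ld -> pblk %ld (%d mapped)\n",
                                   samples[i], pblk, ret);
                    else if (ret == 0)
                            printf("lblk %ld is a hole\n", samples[i]);
                    else
                            printf("lblk %ld: error %d\n", samples[i], ret);
            }
            return 0;
    }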
1066 | |||
1067 | #ifdef CONFIG_QUOTA | 238 | #ifdef CONFIG_QUOTA |
1068 | qsize_t *ext4_get_reserved_space(struct inode *inode) | 239 | qsize_t *ext4_get_reserved_space(struct inode *inode) |
1069 | { | 240 | { |
@@ -1073,33 +244,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode) | |||
1073 | 244 | ||
1074 | /* | 245 | /* |
1075 | * Calculate the number of metadata blocks needed to reserve | 246 | * Calculate the number of metadata blocks needed to reserve |
1076 | * to allocate a new block at @lblock for a non-extent-based file | ||
1077 | */ | ||
1078 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, | ||
1079 | sector_t lblock) | ||
1080 | { | ||
1081 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
1082 | sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1); | ||
1083 | int blk_bits; | ||
1084 | |||
1085 | if (lblock < EXT4_NDIR_BLOCKS) | ||
1086 | return 0; | ||
1087 | |||
1088 | lblock -= EXT4_NDIR_BLOCKS; | ||
1089 | |||
1090 | if (ei->i_da_metadata_calc_len && | ||
1091 | (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { | ||
1092 | ei->i_da_metadata_calc_len++; | ||
1093 | return 0; | ||
1094 | } | ||
1095 | ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; | ||
1096 | ei->i_da_metadata_calc_len = 1; | ||
1097 | blk_bits = order_base_2(lblock); | ||
1098 | return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; | ||
1099 | } | ||
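Worked through standalone with the usual 4KiB-block constants (12 direct blocks, 10 address bits per indirect level), and ignoring the double-indirect-window caching the function layers on top: the first block past the direct ones costs one indirect block, and blocks far enough out to need a double or triple indirect cost two or three.

    #include <stdio.h>

    #define NDIR     12     /* EXT4_NDIR_BLOCKS */
    #define PTR_BITS 10     /* EXT4_ADDR_PER_BLOCK_BITS at 4KiB */

    static int order_base_2(unsigned long n)    /* ceil(log2(n)); 0 for n <= 1 */
    {
            int bits = 0;
            while ((1UL << bits) < n)
                    bits++;
            return bits;
    }

    static int metadata_amount(unsigned long lblock)
    {
            if (lblock < NDIR)
                    return 0;                   /* direct blocks: no metadata */
            return order_base_2(lblock - NDIR) / PTR_BITS + 1;
    }

    int main(void)
    {
            unsigned long samples[] = { 4, 12, 3000, 3000000 };
            for (int i = 0; i < 4; i++)
                    printf("lblock %8lu -> %d metadata block(s)\n",
                           samples[i], metadata_amount(samples[i]));
            return 0;
    }

This prints 0, 1, 2 and 3 metadata blocks respectively for the four sample positions.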
1100 | |||
1101 | /* | ||
1102 | * Calculate the number of metadata blocks needed to reserve | ||
1103 | * to allocate a block located at @lblock | 247 | * to allocate a block located at @lblock |
1104 | */ | 248 | */ |
1105 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) | 249 | static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) |
@@ -1107,7 +251,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) | |||
1107 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) | 251 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
1108 | return ext4_ext_calc_metadata_amount(inode, lblock); | 252 | return ext4_ext_calc_metadata_amount(inode, lblock); |
1109 | 253 | ||
1110 | return ext4_indirect_calc_metadata_amount(inode, lblock); | 254 | return ext4_ind_calc_metadata_amount(inode, lblock); |
1111 | } | 255 | } |
1112 | 256 | ||
1113 | /* | 257 | /* |
@@ -1589,16 +733,6 @@ static int do_journal_get_write_access(handle_t *handle, | |||
1589 | return ret; | 733 | return ret; |
1590 | } | 734 | } |
1591 | 735 | ||
1592 | /* | ||
1593 | * Truncate blocks that were not used by write. We have to truncate the | ||
1594 | * pagecache as well so that corresponding buffers get properly unmapped. | ||
1595 | */ | ||
1596 | static void ext4_truncate_failed_write(struct inode *inode) | ||
1597 | { | ||
1598 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
1599 | ext4_truncate(inode); | ||
1600 | } | ||
1601 | |||
1602 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, | 736 | static int ext4_get_block_write(struct inode *inode, sector_t iblock, |
1603 | struct buffer_head *bh_result, int create); | 737 | struct buffer_head *bh_result, int create); |
1604 | static int ext4_write_begin(struct file *file, struct address_space *mapping, | 738 | static int ext4_write_begin(struct file *file, struct address_space *mapping, |
@@ -1863,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file, | |||
1863 | if (new_i_size > inode->i_size) | 997 | if (new_i_size > inode->i_size) |
1864 | i_size_write(inode, pos+copied); | 998 | i_size_write(inode, pos+copied); |
1865 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); | 999 | ext4_set_inode_state(inode, EXT4_STATE_JDATA); |
1000 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | ||
1866 | if (new_i_size > EXT4_I(inode)->i_disksize) { | 1001 | if (new_i_size > EXT4_I(inode)->i_disksize) { |
1867 | ext4_update_i_disksize(inode, new_i_size); | 1002 | ext4_update_i_disksize(inode, new_i_size); |
1868 | ret2 = ext4_mark_inode_dirty(handle, inode); | 1003 | ret2 = ext4_mark_inode_dirty(handle, inode); |
@@ -2571,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page, | |||
2571 | write_end_fn); | 1706 | write_end_fn); |
2572 | if (ret == 0) | 1707 | if (ret == 0) |
2573 | ret = err; | 1708 | ret = err; |
1709 | EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; | ||
2574 | err = ext4_journal_stop(handle); | 1710 | err = ext4_journal_stop(handle); |
2575 | if (!ret) | 1711 | if (!ret) |
2576 | ret = err; | 1712 | ret = err; |
@@ -3450,112 +2586,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait) | |||
3450 | } | 2586 | } |
3451 | 2587 | ||
3452 | /* | 2588 | /* |
3453 | * O_DIRECT for ext3 (or indirect map) based files | ||
3454 | * | ||
3455 | * If the O_DIRECT write will extend the file then add this inode to the | ||
3456 | * orphan list. So recovery will truncate it back to the original size | ||
3457 | * if the machine crashes during the write. | ||
3458 | * | ||
3459 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | ||
3460 | * crashes then stale disk data _may_ be exposed inside the file. But current | ||
3461 | * VFS code falls back to the buffered path in that case, so we are safe. | ||
3462 | */ | ||
3463 | static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | ||
3464 | const struct iovec *iov, loff_t offset, | ||
3465 | unsigned long nr_segs) | ||
3466 | { | ||
3467 | struct file *file = iocb->ki_filp; | ||
3468 | struct inode *inode = file->f_mapping->host; | ||
3469 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
3470 | handle_t *handle; | ||
3471 | ssize_t ret; | ||
3472 | int orphan = 0; | ||
3473 | size_t count = iov_length(iov, nr_segs); | ||
3474 | int retries = 0; | ||
3475 | |||
3476 | if (rw == WRITE) { | ||
3477 | loff_t final_size = offset + count; | ||
3478 | |||
3479 | if (final_size > inode->i_size) { | ||
3480 | /* Credits for sb + inode write */ | ||
3481 | handle = ext4_journal_start(inode, 2); | ||
3482 | if (IS_ERR(handle)) { | ||
3483 | ret = PTR_ERR(handle); | ||
3484 | goto out; | ||
3485 | } | ||
3486 | ret = ext4_orphan_add(handle, inode); | ||
3487 | if (ret) { | ||
3488 | ext4_journal_stop(handle); | ||
3489 | goto out; | ||
3490 | } | ||
3491 | orphan = 1; | ||
3492 | ei->i_disksize = inode->i_size; | ||
3493 | ext4_journal_stop(handle); | ||
3494 | } | ||
3495 | } | ||
3496 | |||
3497 | retry: | ||
3498 | if (rw == READ && ext4_should_dioread_nolock(inode)) | ||
3499 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
3500 | inode->i_sb->s_bdev, iov, | ||
3501 | offset, nr_segs, | ||
3502 | ext4_get_block, NULL, NULL, 0); | ||
3503 | else { | ||
3504 | ret = blockdev_direct_IO(rw, iocb, inode, iov, | ||
3505 | offset, nr_segs, ext4_get_block); | ||
3506 | |||
3507 | if (unlikely((rw & WRITE) && ret < 0)) { | ||
3508 | loff_t isize = i_size_read(inode); | ||
3509 | loff_t end = offset + iov_length(iov, nr_segs); | ||
3510 | |||
3511 | if (end > isize) | ||
3512 | ext4_truncate_failed_write(inode); | ||
3513 | } | ||
3514 | } | ||
3515 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
3516 | goto retry; | ||
3517 | |||
3518 | if (orphan) { | ||
3519 | int err; | ||
3520 | |||
3521 | /* Credits for sb + inode write */ | ||
3522 | handle = ext4_journal_start(inode, 2); | ||
3523 | if (IS_ERR(handle)) { | ||
3524 | /* This is really bad luck. We've written the data | ||
3525 | * but cannot extend i_size. Bail out and pretend | ||
3526 | * the write failed... */ | ||
3527 | ret = PTR_ERR(handle); | ||
3528 | if (inode->i_nlink) | ||
3529 | ext4_orphan_del(NULL, inode); | ||
3530 | |||
3531 | goto out; | ||
3532 | } | ||
3533 | if (inode->i_nlink) | ||
3534 | ext4_orphan_del(handle, inode); | ||
3535 | if (ret > 0) { | ||
3536 | loff_t end = offset + ret; | ||
3537 | if (end > inode->i_size) { | ||
3538 | ei->i_disksize = end; | ||
3539 | i_size_write(inode, end); | ||
3540 | /* | ||
3541 | * We're going to return a positive `ret' | ||
3542 | * here due to non-zero-length I/O, so there's | ||
3543 | * no way of reporting error returns from | ||
3544 | * ext4_mark_inode_dirty() to userspace. So | ||
3545 | * ignore it. | ||
3546 | */ | ||
3547 | ext4_mark_inode_dirty(handle, inode); | ||
3548 | } | ||
3549 | } | ||
3550 | err = ext4_journal_stop(handle); | ||
3551 | if (ret == 0) | ||
3552 | ret = err; | ||
3553 | } | ||
3554 | out: | ||
3555 | return ret; | ||
3556 | } | ||
3557 | |||
3558 | /* | ||
3559 | * ext4_get_block used when preparing for a DIO write or buffer write. | 2589 | * ext4_get_block used when preparing for a DIO write or buffer write. |
3560 | * We allocate an uninitialized extent if blocks haven't been allocated. | 2590 | * We allocate an uninitialized extent if blocks haven't been allocated. |
3561 | * The extent will be converted to initialized after the IO is complete. | 2591 | * The extent will be converted to initialized after the IO is complete. |
@@ -4033,383 +3063,6 @@ unlock: | |||
4033 | return err; | 3063 | return err; |
4034 | } | 3064 | } |
4035 | 3065 | ||
4036 | /* | ||
4037 | * Probably it should be a library function... search for first non-zero word | ||
4038 | * or memcmp with zero_page, whatever is better for particular architecture. | ||
4039 | * Linus? | ||
4040 | */ | ||
4041 | static inline int all_zeroes(__le32 *p, __le32 *q) | ||
4042 | { | ||
4043 | while (p < q) | ||
4044 | if (*p++) | ||
4045 | return 0; | ||
4046 | return 1; | ||
4047 | } | ||
4048 | |||
4049 | /** | ||
4050 | * ext4_find_shared - find the indirect blocks for partial truncation. | ||
4051 | * @inode: inode in question | ||
4052 | * @depth: depth of the affected branch | ||
4053 | * @offsets: offsets of pointers in that branch (see ext4_block_to_path) | ||
4054 | * @chain: place to store the pointers to partial indirect blocks | ||
4055 | * @top: place to the (detached) top of branch | ||
4056 | * | ||
4057 | * This is a helper function used by ext4_truncate(). | ||
4058 | * | ||
4059 | * When we do truncate() we may have to clean the ends of several | ||
4060 | * indirect blocks but leave the blocks themselves alive. Block is | ||
4061 | * partially truncated if some data below the new i_size is referred | ||
4062 | * from it (and it is on the path to the first completely truncated | ||
4063 | * data block, indeed). We have to free the top of that path along | ||
4064 | * with everything to the right of the path. Since no allocation | ||
4065 | * past the truncation point is possible until ext4_truncate() | ||
4066 | * finishes, we may safely do the latter, but top of branch may | ||
4067 | * require special attention - pageout below the truncation point | ||
4068 | * might try to populate it. | ||
4069 | * | ||
4070 | * We atomically detach the top of branch from the tree, store the | ||
4071 | * block number of its root in *@top, pointers to buffer_heads of | ||
4072 | * partially truncated blocks - in @chain[].bh and pointers to | ||
4073 | * their last elements that should not be removed - in | ||
4074 | * @chain[].p. Return value is the pointer to last filled element | ||
4075 | * of @chain. | ||
4076 | * | ||
4077 | * The work left to the caller is the actual freeing of subtrees: | ||
4078 | * a) free the subtree starting from *@top | ||
4079 | * b) free the subtrees whose roots are stored in | ||
4080 | * (@chain[i].p+1 .. end of @chain[i].bh->b_data) | ||
4081 | * c) free the subtrees growing from the inode past the @chain[0]. | ||
4082 | * (no partially truncated stuff there). */ | ||
4083 | |||
4084 | static Indirect *ext4_find_shared(struct inode *inode, int depth, | ||
4085 | ext4_lblk_t offsets[4], Indirect chain[4], | ||
4086 | __le32 *top) | ||
4087 | { | ||
4088 | Indirect *partial, *p; | ||
4089 | int k, err; | ||
4090 | |||
4091 | *top = 0; | ||
4092 | /* Make k index the deepest non-null offset + 1 */ | ||
4093 | for (k = depth; k > 1 && !offsets[k-1]; k--) | ||
4094 | ; | ||
4095 | partial = ext4_get_branch(inode, k, offsets, chain, &err); | ||
4096 | /* Writer: pointers */ | ||
4097 | if (!partial) | ||
4098 | partial = chain + k-1; | ||
4099 | /* | ||
4100 | * If the branch acquired continuation since we've looked at it - | ||
4101 | * fine, it should all survive and (new) top doesn't belong to us. | ||
4102 | */ | ||
4103 | if (!partial->key && *partial->p) | ||
4104 | /* Writer: end */ | ||
4105 | goto no_top; | ||
4106 | for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) | ||
4107 | ; | ||
4108 | /* | ||
4109 | * OK, we've found the last block that must survive. The rest of our | ||
4110 | * branch should be detached before unlocking. However, if that rest | ||
4111 | * of branch is all ours and does not grow immediately from the inode | ||
4112 | * it's easier to cheat and just decrement partial->p. | ||
4113 | */ | ||
4114 | if (p == chain + k - 1 && p > chain) { | ||
4115 | p->p--; | ||
4116 | } else { | ||
4117 | *top = *p->p; | ||
4118 | /* Nope, don't do this in ext4. Must leave the tree intact */ | ||
4119 | #if 0 | ||
4120 | *p->p = 0; | ||
4121 | #endif | ||
4122 | } | ||
4123 | /* Writer: end */ | ||
4124 | |||
4125 | while (partial > p) { | ||
4126 | brelse(partial->bh); | ||
4127 | partial--; | ||
4128 | } | ||
4129 | no_top: | ||
4130 | return partial; | ||
4131 | } | ||
4132 | |||
4133 | /* | ||
4134 | * Zero a number of block pointers in either an inode or an indirect block. | ||
4135 | * If we restart the transaction we must again get write access to the | ||
4136 | * indirect block for further modification. | ||
4137 | * | ||
4138 | * We release `count' blocks on disk, but (last - first) may be greater | ||
4139 | * than `count' because there can be holes in there. | ||
4140 | * | ||
4141 | * Return 0 on success, 1 on invalid block range | ||
4142 | * and < 0 on fatal error. | ||
4143 | */ | ||
4144 | static int ext4_clear_blocks(handle_t *handle, struct inode *inode, | ||
4145 | struct buffer_head *bh, | ||
4146 | ext4_fsblk_t block_to_free, | ||
4147 | unsigned long count, __le32 *first, | ||
4148 | __le32 *last) | ||
4149 | { | ||
4150 | __le32 *p; | ||
4151 | int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; | ||
4152 | int err; | ||
4153 | |||
4154 | if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) | ||
4155 | flags |= EXT4_FREE_BLOCKS_METADATA; | ||
4156 | |||
4157 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, | ||
4158 | count)) { | ||
4159 | EXT4_ERROR_INODE(inode, "attempt to clear invalid " | ||
4160 | "blocks %llu len %lu", | ||
4161 | (unsigned long long) block_to_free, count); | ||
4162 | return 1; | ||
4163 | } | ||
4164 | |||
4165 | if (try_to_extend_transaction(handle, inode)) { | ||
4166 | if (bh) { | ||
4167 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
4168 | err = ext4_handle_dirty_metadata(handle, inode, bh); | ||
4169 | if (unlikely(err)) | ||
4170 | goto out_err; | ||
4171 | } | ||
4172 | err = ext4_mark_inode_dirty(handle, inode); | ||
4173 | if (unlikely(err)) | ||
4174 | goto out_err; | ||
4175 | err = ext4_truncate_restart_trans(handle, inode, | ||
4176 | blocks_for_truncate(inode)); | ||
4177 | if (unlikely(err)) | ||
4178 | goto out_err; | ||
4179 | if (bh) { | ||
4180 | BUFFER_TRACE(bh, "retaking write access"); | ||
4181 | err = ext4_journal_get_write_access(handle, bh); | ||
4182 | if (unlikely(err)) | ||
4183 | goto out_err; | ||
4184 | } | ||
4185 | } | ||
4186 | |||
4187 | for (p = first; p < last; p++) | ||
4188 | *p = 0; | ||
4189 | |||
4190 | ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags); | ||
4191 | return 0; | ||
4192 | out_err: | ||
4193 | ext4_std_error(inode->i_sb, err); | ||
4194 | return err; | ||
4195 | } | ||
4196 | |||
4197 | /** | ||
4198 | * ext4_free_data - free a list of data blocks | ||
4199 | * @handle: handle for this transaction | ||
4200 | * @inode: inode we are dealing with | ||
4201 | * @this_bh: indirect buffer_head which contains *@first and *@last | ||
4202 | * @first: array of block numbers | ||
4203 | * @last: points immediately past the end of array | ||
4204 | * | ||
4205 | * We are freeing all blocks referred from that array (numbers are stored as | ||
4206 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | ||
4207 | * | ||
4208 | * We accumulate contiguous runs of blocks to free. Conveniently, if these | ||
4209 | * blocks are contiguous then releasing them at one time will only affect one | ||
4210 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | ||
4211 | * actually use a lot of journal space. | ||
4212 | * | ||
4213 | * @this_bh will be %NULL if @first and @last point into the inode's direct | ||
4214 | * block pointers. | ||
4215 | */ | ||
4216 | static void ext4_free_data(handle_t *handle, struct inode *inode, | ||
4217 | struct buffer_head *this_bh, | ||
4218 | __le32 *first, __le32 *last) | ||
4219 | { | ||
4220 | ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */ | ||
4221 | unsigned long count = 0; /* Number of blocks in the run */ | ||
4222 | __le32 *block_to_free_p = NULL; /* Pointer into inode/ind | ||
4223 | corresponding to | ||
4224 | block_to_free */ | ||
4225 | ext4_fsblk_t nr; /* Current block # */ | ||
4226 | __le32 *p; /* Pointer into inode/ind | ||
4227 | for current block */ | ||
4228 | int err = 0; | ||
4229 | |||
4230 | if (this_bh) { /* For indirect block */ | ||
4231 | BUFFER_TRACE(this_bh, "get_write_access"); | ||
4232 | err = ext4_journal_get_write_access(handle, this_bh); | ||
4233 | /* Important: if we can't update the indirect pointers | ||
4234 | * to the blocks, we can't free them. */ | ||
4235 | if (err) | ||
4236 | return; | ||
4237 | } | ||
4238 | |||
4239 | for (p = first; p < last; p++) { | ||
4240 | nr = le32_to_cpu(*p); | ||
4241 | if (nr) { | ||
4242 | /* accumulate blocks to free if they're contiguous */ | ||
4243 | if (count == 0) { | ||
4244 | block_to_free = nr; | ||
4245 | block_to_free_p = p; | ||
4246 | count = 1; | ||
4247 | } else if (nr == block_to_free + count) { | ||
4248 | count++; | ||
4249 | } else { | ||
4250 | err = ext4_clear_blocks(handle, inode, this_bh, | ||
4251 | block_to_free, count, | ||
4252 | block_to_free_p, p); | ||
4253 | if (err) | ||
4254 | break; | ||
4255 | block_to_free = nr; | ||
4256 | block_to_free_p = p; | ||
4257 | count = 1; | ||
4258 | } | ||
4259 | } | ||
4260 | } | ||
4261 | |||
4262 | if (!err && count > 0) | ||
4263 | err = ext4_clear_blocks(handle, inode, this_bh, block_to_free, | ||
4264 | count, block_to_free_p, p); | ||
4265 | if (err < 0) | ||
4266 | /* fatal error */ | ||
4267 | return; | ||
4268 | |||
4269 | if (this_bh) { | ||
4270 | BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); | ||
4271 | |||
4272 | /* | ||
4273 | * The buffer head should have an attached journal head at this | ||
4274 | * point. However, if the data is corrupted and an indirect | ||
4275 | * block pointed to itself, it would have been detached when | ||
4276 | * the block was cleared. Check for this instead of OOPSing. | ||
4277 | */ | ||
4278 | if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) | ||
4279 | ext4_handle_dirty_metadata(handle, inode, this_bh); | ||
4280 | else | ||
4281 | EXT4_ERROR_INODE(inode, | ||
4282 | "circular indirect block detected at " | ||
4283 | "block %llu", | ||
4284 | (unsigned long long) this_bh->b_blocknr); | ||
4285 | } | ||
4286 | } | ||
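The run accumulation reduces to a small standalone loop over an array of block numbers (hypothetical values; zeroes are holes), flushing each contiguous run as one free operation:

    #include <stdio.h>

    static void free_run(unsigned long start, unsigned long count)
    {
            printf("free %lu block(s) starting at %lu\n", count, start);
    }

    int main(void)
    {
            unsigned long blocks[] = { 100, 101, 102, 0, 0, 500, 501, 77 };
            unsigned long start = 0, count = 0;

            for (int i = 0; i < 8; i++) {
                    unsigned long nr = blocks[i];
                    if (!nr)
                            continue;               /* skip holes */
                    if (count == 0) {
                            start = nr;             /* open a new run */
                            count = 1;
                    } else if (nr == start + count) {
                            count++;                /* extend the current run */
                    } else {
                            free_run(start, count); /* run broken: flush it */
                            start = nr;
                            count = 1;
                    }
            }
            if (count)
                    free_run(start, count);         /* flush the final run */
            return 0;
    }

Three frees cover all six live blocks, which is exactly the journal-credit saving the comment above describes.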
4287 | |||
4288 | /** | ||
4289 | * ext4_free_branches - free an array of branches | ||
4290 | * @handle: JBD handle for this transaction | ||
4291 | * @inode: inode we are dealing with | ||
4292 | * @parent_bh: the buffer_head which contains *@first and *@last | ||
4293 | * @first: array of block numbers | ||
4294 | * @last: pointer immediately past the end of array | ||
4295 | * @depth: depth of the branches to free | ||
4296 | * | ||
4297 | * We are freeing all blocks referred from these branches (numbers are | ||
4298 | * stored as little-endian 32-bit) and updating @inode->i_blocks | ||
4299 | * appropriately. | ||
4300 | */ | ||
4301 | static void ext4_free_branches(handle_t *handle, struct inode *inode, | ||
4302 | struct buffer_head *parent_bh, | ||
4303 | __le32 *first, __le32 *last, int depth) | ||
4304 | { | ||
4305 | ext4_fsblk_t nr; | ||
4306 | __le32 *p; | ||
4307 | |||
4308 | if (ext4_handle_is_aborted(handle)) | ||
4309 | return; | ||
4310 | |||
4311 | if (depth--) { | ||
4312 | struct buffer_head *bh; | ||
4313 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4314 | p = last; | ||
4315 | while (--p >= first) { | ||
4316 | nr = le32_to_cpu(*p); | ||
4317 | if (!nr) | ||
4318 | continue; /* A hole */ | ||
4319 | |||
4320 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), | ||
4321 | nr, 1)) { | ||
4322 | EXT4_ERROR_INODE(inode, | ||
4323 | "invalid indirect mapped " | ||
4324 | "block %lu (level %d)", | ||
4325 | (unsigned long) nr, depth); | ||
4326 | break; | ||
4327 | } | ||
4328 | |||
4329 | /* Go read the buffer for the next level down */ | ||
4330 | bh = sb_bread(inode->i_sb, nr); | ||
4331 | |||
4332 | /* | ||
4333 | * A read failure? Report error and clear slot | ||
4334 | * (should be rare). | ||
4335 | */ | ||
4336 | if (!bh) { | ||
4337 | EXT4_ERROR_INODE_BLOCK(inode, nr, | ||
4338 | "Read failure"); | ||
4339 | continue; | ||
4340 | } | ||
4341 | |||
4342 | /* This zaps the entire block. Bottom up. */ | ||
4343 | BUFFER_TRACE(bh, "free child branches"); | ||
4344 | ext4_free_branches(handle, inode, bh, | ||
4345 | (__le32 *) bh->b_data, | ||
4346 | (__le32 *) bh->b_data + addr_per_block, | ||
4347 | depth); | ||
4348 | brelse(bh); | ||
4349 | |||
4350 | /* | ||
4351 | * Everything below this pointer has been | ||
4352 | * released. Now let this top-of-subtree go. | ||
4353 | * | ||
4354 | * We want the freeing of this indirect block to be | ||
4355 | * atomic in the journal with the updating of the | ||
4356 | * bitmap block which owns it. So make some room in | ||
4357 | * the journal. | ||
4358 | * | ||
4359 | * We zero the parent pointer *after* freeing its | ||
4360 | * pointee in the bitmaps, so if extend_transaction() | ||
4361 | * for some reason fails to put the bitmap changes and | ||
4362 | * the release into the same transaction, recovery | ||
4363 | * will merely complain about releasing a free block, | ||
4364 | * rather than leaking blocks. | ||
4365 | */ | ||
4366 | if (ext4_handle_is_aborted(handle)) | ||
4367 | return; | ||
4368 | if (try_to_extend_transaction(handle, inode)) { | ||
4369 | ext4_mark_inode_dirty(handle, inode); | ||
4370 | ext4_truncate_restart_trans(handle, inode, | ||
4371 | blocks_for_truncate(inode)); | ||
4372 | } | ||
4373 | |||
4374 | /* | ||
4375 | * The forget flag here is critical because if | ||
4376 | * we are journaling (and not doing data | ||
4377 | * journaling), we have to make sure a revoke | ||
4378 | * record is written to prevent the journal | ||
4379 | * replay from overwriting the (former) | ||
4380 | * indirect block if it gets reallocated as a | ||
4381 | * data block. This must happen in the same | ||
4382 | * transaction where the data blocks are | ||
4383 | * actually freed. | ||
4384 | */ | ||
4385 | ext4_free_blocks(handle, inode, NULL, nr, 1, | ||
4386 | EXT4_FREE_BLOCKS_METADATA| | ||
4387 | EXT4_FREE_BLOCKS_FORGET); | ||
4388 | |||
4389 | if (parent_bh) { | ||
4390 | /* | ||
4391 | * The block which we have just freed is | ||
4392 | * pointed to by an indirect block: journal it | ||
4393 | */ | ||
4394 | BUFFER_TRACE(parent_bh, "get_write_access"); | ||
4395 | if (!ext4_journal_get_write_access(handle, | ||
4396 | parent_bh)){ | ||
4397 | *p = 0; | ||
4398 | BUFFER_TRACE(parent_bh, | ||
4399 | "call ext4_handle_dirty_metadata"); | ||
4400 | ext4_handle_dirty_metadata(handle, | ||
4401 | inode, | ||
4402 | parent_bh); | ||
4403 | } | ||
4404 | } | ||
4405 | } | ||
4406 | } else { | ||
4407 | /* We have reached the bottom of the tree. */ | ||
4408 | BUFFER_TRACE(parent_bh, "free data blocks"); | ||
4409 | ext4_free_data(handle, inode, parent_bh, first, last); | ||
4410 | } | ||
4411 | } | ||
4412 | |||
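ext4_free_branches() above releases a subtree strictly bottom-up: everything reachable from an indirect block is freed before the parent slot that pointed at it is cleared, so a crash between the two steps can at worst leave a reference to an already-free block instead of leaking it. A minimal user-space sketch of that contract, with every name invented for illustration (the real function frees data blocks through ext4_free_data() rather than recursing into them):

```c
#include <stdio.h>
#include <stdlib.h>

#define FANOUT 4                       /* stands in for addr_per_block */

struct blk {
    struct blk *child[FANOUT];         /* NULL plays the role of a hole */
};

static void free_branches(struct blk **slot, int depth)
{
    struct blk *b = *slot;
    int i;

    if (!b)
        return;                        /* a hole: nothing to do */
    if (depth--)                       /* still an indirect level: recurse */
        for (i = 0; i < FANOUT; i++)
            free_branches(&b->child[i], depth);
    free(b);                           /* free the pointee first ...      */
    *slot = NULL;                      /* ... then clear the parent slot  */
}

int main(void)
{
    struct blk *root = calloc(1, sizeof(*root));
    root->child[0] = calloc(1, sizeof(*root));  /* one indirect child */
    free_branches(&root, 2);                    /* depth-2 toy tree   */
    printf("root slot after free: %p\n", (void *)root);
    return 0;
}
```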
4413 | int ext4_can_truncate(struct inode *inode) | 3066 | int ext4_can_truncate(struct inode *inode) |
4414 | { | 3067 | { |
4415 | if (S_ISREG(inode->i_mode)) | 3068 | if (S_ISREG(inode->i_mode)) |
@@ -4476,19 +3129,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4476 | */ | 3129 | */ |
4477 | void ext4_truncate(struct inode *inode) | 3130 | void ext4_truncate(struct inode *inode) |
4478 | { | 3131 | { |
4479 | handle_t *handle; | ||
4480 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
4481 | __le32 *i_data = ei->i_data; | ||
4482 | int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
4483 | struct address_space *mapping = inode->i_mapping; | ||
4484 | ext4_lblk_t offsets[4]; | ||
4485 | Indirect chain[4]; | ||
4486 | Indirect *partial; | ||
4487 | __le32 nr = 0; | ||
4488 | int n = 0; | ||
4489 | ext4_lblk_t last_block, max_block; | ||
4490 | unsigned blocksize = inode->i_sb->s_blocksize; | ||
4491 | |||
4492 | trace_ext4_truncate_enter(inode); | 3132 | trace_ext4_truncate_enter(inode); |
4493 | 3133 | ||
4494 | if (!ext4_can_truncate(inode)) | 3134 | if (!ext4_can_truncate(inode)) |
@@ -4499,149 +3139,11 @@ void ext4_truncate(struct inode *inode) | |||
4499 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | 3139 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
4500 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); | 3140 | ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); |
4501 | 3141 | ||
4502 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { | 3142 | if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) |
4503 | ext4_ext_truncate(inode); | 3143 | ext4_ext_truncate(inode); |
4504 | trace_ext4_truncate_exit(inode); | 3144 | else |
4505 | return; | 3145 | ext4_ind_truncate(inode); |
4506 | } | ||
4507 | |||
4508 | handle = start_transaction(inode); | ||
4509 | if (IS_ERR(handle)) | ||
4510 | return; /* AKPM: return what? */ | ||
4511 | |||
4512 | last_block = (inode->i_size + blocksize-1) | ||
4513 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4514 | max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1) | ||
4515 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | ||
4516 | |||
4517 | if (inode->i_size & (blocksize - 1)) | ||
4518 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | ||
4519 | goto out_stop; | ||
4520 | |||
4521 | if (last_block != max_block) { | ||
4522 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | ||
4523 | if (n == 0) | ||
4524 | goto out_stop; /* error */ | ||
4525 | } | ||
4526 | |||
4527 | /* | ||
4528 | * OK. This truncate is going to happen. We add the inode to the | ||
4529 | * orphan list, so that if this truncate spans multiple transactions, | ||
4530 | * and we crash, we will resume the truncate when the filesystem | ||
4531 | * recovers. It also marks the inode dirty, to catch the new size. | ||
4532 | * | ||
4533 | * Implication: the file must always be in a sane, consistent | ||
4534 | * truncatable state while each transaction commits. | ||
4535 | */ | ||
4536 | if (ext4_orphan_add(handle, inode)) | ||
4537 | goto out_stop; | ||
4538 | |||
4539 | /* | ||
4540 | * From here we block out all ext4_get_block() callers who want to | ||
4541 | * modify the block allocation tree. | ||
4542 | */ | ||
4543 | down_write(&ei->i_data_sem); | ||
4544 | |||
4545 | ext4_discard_preallocations(inode); | ||
4546 | |||
4547 | /* | ||
4548 | * The orphan list entry will now protect us from any crash which | ||
4549 | * occurs before the truncate completes, so it is now safe to propagate | ||
4550 | * the new, shorter inode size (held for now in i_size) into the | ||
4551 | * on-disk inode. We do this via i_disksize, which is the value which | ||
4552 | * ext4 *really* writes onto the disk inode. | ||
4553 | */ | ||
4554 | ei->i_disksize = inode->i_size; | ||
4555 | |||
4556 | if (last_block == max_block) { | ||
4557 | /* | ||
4558 | * It is unnecessary to free any data blocks if last_block is | ||
4559 | * equal to the indirect block limit. | ||
4560 | */ | ||
4561 | goto out_unlock; | ||
4562 | } else if (n == 1) { /* direct blocks */ | ||
4563 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | ||
4564 | i_data + EXT4_NDIR_BLOCKS); | ||
4565 | goto do_indirects; | ||
4566 | } | ||
4567 | |||
4568 | partial = ext4_find_shared(inode, n, offsets, chain, &nr); | ||
4569 | /* Kill the top of shared branch (not detached) */ | ||
4570 | if (nr) { | ||
4571 | if (partial == chain) { | ||
4572 | /* Shared branch grows from the inode */ | ||
4573 | ext4_free_branches(handle, inode, NULL, | ||
4574 | &nr, &nr+1, (chain+n-1) - partial); | ||
4575 | *partial->p = 0; | ||
4576 | /* | ||
4577 | * We mark the inode dirty prior to restart, | ||
4578 | * and prior to stop. No need for it here. | ||
4579 | */ | ||
4580 | } else { | ||
4581 | /* Shared branch grows from an indirect block */ | ||
4582 | BUFFER_TRACE(partial->bh, "get_write_access"); | ||
4583 | ext4_free_branches(handle, inode, partial->bh, | ||
4584 | partial->p, | ||
4585 | partial->p+1, (chain+n-1) - partial); | ||
4586 | } | ||
4587 | } | ||
4588 | /* Clear the ends of indirect blocks on the shared branch */ | ||
4589 | while (partial > chain) { | ||
4590 | ext4_free_branches(handle, inode, partial->bh, partial->p + 1, | ||
4591 | (__le32*)partial->bh->b_data+addr_per_block, | ||
4592 | (chain+n-1) - partial); | ||
4593 | BUFFER_TRACE(partial->bh, "call brelse"); | ||
4594 | brelse(partial->bh); | ||
4595 | partial--; | ||
4596 | } | ||
4597 | do_indirects: | ||
4598 | /* Kill the remaining (whole) subtrees */ | ||
4599 | switch (offsets[0]) { | ||
4600 | default: | ||
4601 | nr = i_data[EXT4_IND_BLOCK]; | ||
4602 | if (nr) { | ||
4603 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1); | ||
4604 | i_data[EXT4_IND_BLOCK] = 0; | ||
4605 | } | ||
4606 | case EXT4_IND_BLOCK: | ||
4607 | nr = i_data[EXT4_DIND_BLOCK]; | ||
4608 | if (nr) { | ||
4609 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2); | ||
4610 | i_data[EXT4_DIND_BLOCK] = 0; | ||
4611 | } | ||
4612 | case EXT4_DIND_BLOCK: | ||
4613 | nr = i_data[EXT4_TIND_BLOCK]; | ||
4614 | if (nr) { | ||
4615 | ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3); | ||
4616 | i_data[EXT4_TIND_BLOCK] = 0; | ||
4617 | } | ||
4618 | case EXT4_TIND_BLOCK: | ||
4619 | ; | ||
4620 | } | ||
4621 | |||
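The do_indirects switch above depends on deliberate case fall-through: offsets[0] selects the first whole subtree to kill, and every later case runs as well, so all higher-level indirect trees are freed too. A compilable sketch of the idiom, with invented names:

```c
#include <stdio.h>

enum level { DIRECT, IND, DIND, TIND };

static void kill_from(enum level first)
{
    switch (first) {                   /* no break: cases cascade */
    case DIRECT:
        puts("free the single-indirect tree");
        /* fall through */
    case IND:
        puts("free the double-indirect tree");
        /* fall through */
    case DIND:
        puts("free the triple-indirect tree");
        /* fall through */
    case TIND:
        ;                              /* nothing left above this level */
    }
}

int main(void)
{
    kill_from(IND);    /* frees the double- and triple-indirect trees */
    return 0;
}
```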
4622 | out_unlock: | ||
4623 | up_write(&ei->i_data_sem); | ||
4624 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | ||
4625 | ext4_mark_inode_dirty(handle, inode); | ||
4626 | |||
4627 | /* | ||
4628 | * In a multi-transaction truncate, we only make the final transaction | ||
4629 | * synchronous | ||
4630 | */ | ||
4631 | if (IS_SYNC(inode)) | ||
4632 | ext4_handle_sync(handle); | ||
4633 | out_stop: | ||
4634 | /* | ||
4635 | * If this was a simple ftruncate(), and the file will remain alive | ||
4636 | * then we need to clear up the orphan record which we created above. | ||
4637 | * However, if this was a real unlink then we were called by | ||
4638 | * ext4_delete_inode(), and we allow that function to clean up the | ||
4639 | * orphan info for us. | ||
4640 | */ | ||
4641 | if (inode->i_nlink) | ||
4642 | ext4_orphan_del(handle, inode); | ||
4643 | 3146 | ||
4644 | ext4_journal_stop(handle); | ||
4645 | trace_ext4_truncate_exit(inode); | 3147 | trace_ext4_truncate_exit(inode); |
4646 | } | 3148 | } |
4647 | 3149 | ||
@@ -5012,7 +3514,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) | |||
5012 | (S_ISLNK(inode->i_mode) && | 3514 | (S_ISLNK(inode->i_mode) && |
5013 | !ext4_inode_is_fast_symlink(inode))) { | 3515 | !ext4_inode_is_fast_symlink(inode))) { |
5014 | /* Validate block references which are part of inode */ | 3516 | /* Validate block references which are part of inode */ |
5015 | ret = ext4_check_inode_blockref(inode); | 3517 | ret = ext4_ind_check_inode(inode); |
5016 | } | 3518 | } |
5017 | if (ret) | 3519 | if (ret) |
5018 | goto bad_inode; | 3520 | goto bad_inode; |
@@ -5459,34 +3961,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
5459 | return 0; | 3961 | return 0; |
5460 | } | 3962 | } |
5461 | 3963 | ||
5462 | static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | ||
5463 | int chunk) | ||
5464 | { | ||
5465 | int indirects; | ||
5466 | |||
5467 | /* if nrblocks are contiguous */ | ||
5468 | if (chunk) { | ||
5469 | /* | ||
5470 | * With N contiguous data blocks, we need at most | ||
5471 | * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks, | ||
5472 | * 2 dindirect blocks, and 1 tindirect block | ||
5473 | */ | ||
5474 | return DIV_ROUND_UP(nrblocks, | ||
5475 | EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4; | ||
5476 | } | ||
5477 | /* | ||
5478 | * if nrblocks are not contiguous, worst case, each block touches | ||
5479 | * an indirect block, and each indirect block touches a double | ||
5480 | * indirect block, plus a triple indirect block | ||
5481 | */ | ||
5482 | indirects = nrblocks * 2 + 1; | ||
5483 | return indirects; | ||
5484 | } | ||
5485 | |||
5486 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | 3964 | static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) |
5487 | { | 3965 | { |
5488 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) | 3966 | if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) |
5489 | return ext4_indirect_trans_blocks(inode, nrblocks, chunk); | 3967 | return ext4_ind_trans_blocks(inode, nrblocks, chunk); |
5490 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); | 3968 | return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); |
5491 | } | 3969 | } |
5492 | 3970 | ||
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 808c554e773f..f18bfe37aff8 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -202,8 +202,9 @@ setversion_out: | |||
202 | struct super_block *sb = inode->i_sb; | 202 | struct super_block *sb = inode->i_sb; |
203 | int err, err2=0; | 203 | int err, err2=0; |
204 | 204 | ||
205 | if (!capable(CAP_SYS_RESOURCE)) | 205 | err = ext4_resize_begin(sb); |
206 | return -EPERM; | 206 | if (err) |
207 | return err; | ||
207 | 208 | ||
208 | if (get_user(n_blocks_count, (__u32 __user *)arg)) | 209 | if (get_user(n_blocks_count, (__u32 __user *)arg)) |
209 | return -EFAULT; | 210 | return -EFAULT; |
@@ -221,6 +222,7 @@ setversion_out: | |||
221 | if (err == 0) | 222 | if (err == 0) |
222 | err = err2; | 223 | err = err2; |
223 | mnt_drop_write(filp->f_path.mnt); | 224 | mnt_drop_write(filp->f_path.mnt); |
225 | ext4_resize_end(sb); | ||
224 | 226 | ||
225 | return err; | 227 | return err; |
226 | } | 228 | } |
@@ -271,8 +273,9 @@ mext_out: | |||
271 | struct super_block *sb = inode->i_sb; | 273 | struct super_block *sb = inode->i_sb; |
272 | int err, err2=0; | 274 | int err, err2=0; |
273 | 275 | ||
274 | if (!capable(CAP_SYS_RESOURCE)) | 276 | err = ext4_resize_begin(sb); |
275 | return -EPERM; | 277 | if (err) |
278 | return err; | ||
276 | 279 | ||
277 | if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, | 280 | if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, |
278 | sizeof(input))) | 281 | sizeof(input))) |
@@ -291,6 +294,7 @@ mext_out: | |||
291 | if (err == 0) | 294 | if (err == 0) |
292 | err = err2; | 295 | err = err2; |
293 | mnt_drop_write(filp->f_path.mnt); | 296 | mnt_drop_write(filp->f_path.mnt); |
297 | ext4_resize_end(sb); | ||
294 | 298 | ||
295 | return err; | 299 | return err; |
296 | } | 300 | } |
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6ed859d56850..17a5a57c415a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -75,8 +75,8 @@ | |||
75 | * | 75 | * |
76 | * The inode preallocation space is used looking at the _logical_ start | 76 | * The inode preallocation space is used looking at the _logical_ start |
77 | * block. If only the logical file block falls within the range of prealloc | 77 | * block. If only the logical file block falls within the range of prealloc |
78 | * space we will consume the particular prealloc space. This make sure that | 78 | * space we will consume the particular prealloc space. This makes sure that |
79 | * that the we have contiguous physical blocks representing the file blocks | 79 | * we have contiguous physical blocks representing the file blocks |
80 | * | 80 | * |
81 | * The important thing to be noted in case of inode prealloc space is that | 81 | * The important thing to be noted in case of inode prealloc space is that |
82 | * we don't modify the values associated to inode prealloc space except | 82 | * we don't modify the values associated to inode prealloc space except |
@@ -84,7 +84,7 @@ | |||
84 | * | 84 | * |
85 | * If we are not able to find blocks in the inode prealloc space and if we | 85 | * If we are not able to find blocks in the inode prealloc space and if we |
86 | * have the group allocation flag set then we look at the locality group | 86 | * have the group allocation flag set then we look at the locality group |
87 | * prealloc space. These are per CPU prealloc list repreasented as | 87 | * prealloc space. These are per CPU prealloc list represented as |
88 | * | 88 | * |
89 | * ext4_sb_info.s_locality_groups[smp_processor_id()] | 89 | * ext4_sb_info.s_locality_groups[smp_processor_id()] |
90 | * | 90 | * |
@@ -128,12 +128,13 @@ | |||
128 | * we are doing a group prealloc we try to normalize the request to | 128 | * we are doing a group prealloc we try to normalize the request to |
129 | * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is | 129 | * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is |
130 | * 512 blocks. This can be tuned via | 130 | * 512 blocks. This can be tuned via |
131 | * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in | 131 | * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in |
132 | * terms of number of blocks. If we have mounted the file system with -O | 132 | * terms of number of blocks. If we have mounted the file system with -O |
133 | * stripe=<value> option the group prealloc request is normalized to the | 133 | * stripe=<value> option the group prealloc request is normalized to the |
134 | * stripe value (sbi->s_stripe) | 134 | * smallest multiple of the stripe value (sbi->s_stripe) which is |
135 | * greater than the default mb_group_prealloc. | ||
135 | * | 136 | * |
136 | * The regular allocator(using the buddy cache) supports few tunables. | 137 | * The regular allocator (using the buddy cache) supports a few tunables. |
137 | * | 138 | * |
138 | * /sys/fs/ext4/<partition>/mb_min_to_scan | 139 | * /sys/fs/ext4/<partition>/mb_min_to_scan |
139 | * /sys/fs/ext4/<partition>/mb_max_to_scan | 140 | * /sys/fs/ext4/<partition>/mb_max_to_scan |
@@ -152,7 +153,7 @@ | |||
152 | * best extent in the found extents. Searching for the blocks starts with | 153 | * best extent in the found extents. Searching for the blocks starts with |
153 | * the group specified as the goal value in allocation context via | 154 | * the group specified as the goal value in allocation context via |
154 | * ac_g_ex. Each group is first checked based on the criteria whether it | 155 | * ac_g_ex. Each group is first checked based on the criteria whether it |
155 | * can used for allocation. ext4_mb_good_group explains how the groups are | 156 | * can be used for allocation. ext4_mb_good_group explains how the groups are |
156 | * checked. | 157 | * checked. |
157 | * | 158 | * |
158 | * Both the prealloc space are getting populated as above. So for the first | 159 | * Both the prealloc space are getting populated as above. So for the first |
@@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) | |||
492 | b2 = (unsigned char *) bitmap; | 493 | b2 = (unsigned char *) bitmap; |
493 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { | 494 | for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { |
494 | if (b1[i] != b2[i]) { | 495 | if (b1[i] != b2[i]) { |
495 | printk(KERN_ERR "corruption in group %u " | 496 | ext4_msg(e4b->bd_sb, KERN_ERR, |
496 | "at byte %u(%u): %x in copy != %x " | 497 | "corruption in group %u " |
497 | "on disk/prealloc\n", | 498 | "at byte %u(%u): %x in copy != %x " |
498 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | 499 | "on disk/prealloc", |
500 | e4b->bd_group, i, i * 8, b1[i], b2[i]); | ||
499 | BUG(); | 501 | BUG(); |
500 | } | 502 | } |
501 | } | 503 | } |
@@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
1125 | grp = ext4_get_group_info(sb, group); | 1127 | grp = ext4_get_group_info(sb, group); |
1126 | 1128 | ||
1127 | e4b->bd_blkbits = sb->s_blocksize_bits; | 1129 | e4b->bd_blkbits = sb->s_blocksize_bits; |
1128 | e4b->bd_info = ext4_get_group_info(sb, group); | 1130 | e4b->bd_info = grp; |
1129 | e4b->bd_sb = sb; | 1131 | e4b->bd_sb = sb; |
1130 | e4b->bd_group = group; | 1132 | e4b->bd_group = group; |
1131 | e4b->bd_buddy_page = NULL; | 1133 | e4b->bd_buddy_page = NULL; |
@@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len) | |||
1281 | } | 1283 | } |
1282 | } | 1284 | } |
1283 | 1285 | ||
1284 | static void mb_set_bits(void *bm, int cur, int len) | 1286 | void ext4_set_bits(void *bm, int cur, int len) |
1285 | { | 1287 | { |
1286 | __u32 *addr; | 1288 | __u32 *addr; |
1287 | 1289 | ||
@@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) | |||
1510 | } | 1512 | } |
1511 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); | 1513 | mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); |
1512 | 1514 | ||
1513 | mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); | 1515 | ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); |
1514 | mb_check_buddy(e4b); | 1516 | mb_check_buddy(e4b); |
1515 | 1517 | ||
1516 | return ret; | 1518 | return ret; |
@@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2223 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2225 | EXT4_DESC_PER_BLOCK_BITS(sb); |
2224 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | 2226 | meta_group_info = kmalloc(metalen, GFP_KERNEL); |
2225 | if (meta_group_info == NULL) { | 2227 | if (meta_group_info == NULL) { |
2226 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | 2228 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem " |
2227 | "buddy group\n"); | 2229 | "for a buddy group"); |
2228 | goto exit_meta_group_info; | 2230 | goto exit_meta_group_info; |
2229 | } | 2231 | } |
2230 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = | 2232 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = |
@@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2237 | 2239 | ||
2238 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); | 2240 | meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); |
2239 | if (meta_group_info[i] == NULL) { | 2241 | if (meta_group_info[i] == NULL) { |
2240 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | 2242 | ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem"); |
2241 | goto exit_group_info; | 2243 | goto exit_group_info; |
2242 | } | 2244 | } |
2243 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); | 2245 | memset(meta_group_info[i], 0, kmem_cache_size(cachep)); |
@@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2279 | 2281 | ||
2280 | exit_group_info: | 2282 | exit_group_info: |
2281 | /* If a meta_group_info table has been allocated, release it now */ | 2283 | /* If a meta_group_info table has been allocated, release it now */ |
2282 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) | 2284 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { |
2283 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); | 2285 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); |
2286 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; | ||
2287 | } | ||
2284 | exit_meta_group_info: | 2288 | exit_meta_group_info: |
2285 | return -ENOMEM; | 2289 | return -ENOMEM; |
2286 | } /* ext4_mb_add_groupinfo */ | 2290 | } /* ext4_mb_add_groupinfo */ |
@@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
2328 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | 2332 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte |
2329 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | 2333 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. |
2330 | * So a two level scheme suffices for now. */ | 2334 | * So a two level scheme suffices for now. */ |
2331 | sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); | 2335 | sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL); |
2332 | if (sbi->s_group_info == NULL) { | 2336 | if (sbi->s_group_info == NULL) { |
2333 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | 2337 | ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); |
2334 | return -ENOMEM; | 2338 | return -ENOMEM; |
2335 | } | 2339 | } |
2336 | sbi->s_buddy_cache = new_inode(sb); | 2340 | sbi->s_buddy_cache = new_inode(sb); |
2337 | if (sbi->s_buddy_cache == NULL) { | 2341 | if (sbi->s_buddy_cache == NULL) { |
2338 | printk(KERN_ERR "EXT4-fs: can't get new inode\n"); | 2342 | ext4_msg(sb, KERN_ERR, "can't get new inode"); |
2339 | goto err_freesgi; | 2343 | goto err_freesgi; |
2340 | } | 2344 | } |
2341 | sbi->s_buddy_cache->i_ino = get_next_ino(); | 2345 | /* To avoid potentially colliding with a valid on-disk inode number, |
2346 | * use EXT4_BAD_INO for the buddy cache inode number. This inode is | ||
2347 | * not in the inode hash, so it should never be found by iget(), but | ||
2348 | * this will avoid confusion if it ever shows up during debugging. */ | ||
2349 | sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; | ||
2342 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; | 2350 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; |
2343 | for (i = 0; i < ngroups; i++) { | 2351 | for (i = 0; i < ngroups; i++) { |
2344 | desc = ext4_get_group_desc(sb, i, NULL); | 2352 | desc = ext4_get_group_desc(sb, i, NULL); |
2345 | if (desc == NULL) { | 2353 | if (desc == NULL) { |
2346 | printk(KERN_ERR | 2354 | ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); |
2347 | "EXT4-fs: can't read descriptor %u\n", i); | ||
2348 | goto err_freebuddy; | 2355 | goto err_freebuddy; |
2349 | } | 2356 | } |
2350 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) | 2357 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) |
@@ -2362,7 +2369,7 @@ err_freebuddy: | |||
2362 | kfree(sbi->s_group_info[i]); | 2369 | kfree(sbi->s_group_info[i]); |
2363 | iput(sbi->s_buddy_cache); | 2370 | iput(sbi->s_buddy_cache); |
2364 | err_freesgi: | 2371 | err_freesgi: |
2365 | kfree(sbi->s_group_info); | 2372 | ext4_kvfree(sbi->s_group_info); |
2366 | return -ENOMEM; | 2373 | return -ENOMEM; |
2367 | } | 2374 | } |
2368 | 2375 | ||
@@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size) | |||
2404 | slab_size, 0, SLAB_RECLAIM_ACCOUNT, | 2411 | slab_size, 0, SLAB_RECLAIM_ACCOUNT, |
2405 | NULL); | 2412 | NULL); |
2406 | 2413 | ||
2414 | ext4_groupinfo_caches[cache_index] = cachep; | ||
2415 | |||
2407 | mutex_unlock(&ext4_grpinfo_slab_create_mutex); | 2416 | mutex_unlock(&ext4_grpinfo_slab_create_mutex); |
2408 | if (!cachep) { | 2417 | if (!cachep) { |
2409 | printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); | 2418 | printk(KERN_EMERG |
2419 | "EXT4-fs: no memory for groupinfo slab cache\n"); | ||
2410 | return -ENOMEM; | 2420 | return -ENOMEM; |
2411 | } | 2421 | } |
2412 | 2422 | ||
2413 | ext4_groupinfo_caches[cache_index] = cachep; | ||
2414 | |||
2415 | return 0; | 2423 | return 0; |
2416 | } | 2424 | } |
2417 | 2425 | ||
@@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2457 | i++; | 2465 | i++; |
2458 | } while (i <= sb->s_blocksize_bits + 1); | 2466 | } while (i <= sb->s_blocksize_bits + 1); |
2459 | 2467 | ||
2460 | /* init file for buddy data */ | ||
2461 | ret = ext4_mb_init_backend(sb); | ||
2462 | if (ret != 0) { | ||
2463 | goto out; | ||
2464 | } | ||
2465 | |||
2466 | spin_lock_init(&sbi->s_md_lock); | 2468 | spin_lock_init(&sbi->s_md_lock); |
2467 | spin_lock_init(&sbi->s_bal_lock); | 2469 | spin_lock_init(&sbi->s_bal_lock); |
2468 | 2470 | ||
@@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2472 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; | 2474 | sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; |
2473 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; | 2475 | sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; |
2474 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; | 2476 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; |
2477 | /* | ||
2478 | * If there is an s_stripe > 1, then we set the s_mb_group_prealloc | ||
2479 | * to the lowest multiple of s_stripe which is bigger than | ||
2480 | * the s_mb_group_prealloc as determined above. We want | ||
2481 | * the preallocation size to be an exact multiple of the | ||
2482 | * RAID stripe size so that preallocations don't fragment | ||
2483 | * the stripes. | ||
2484 | */ | ||
2485 | if (sbi->s_stripe > 1) { | ||
2486 | sbi->s_mb_group_prealloc = roundup( | ||
2487 | sbi->s_mb_group_prealloc, sbi->s_stripe); | ||
2488 | } | ||
2475 | 2489 | ||
2476 | sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); | 2490 | sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); |
2477 | if (sbi->s_locality_groups == NULL) { | 2491 | if (sbi->s_locality_groups == NULL) { |
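The roundup() above keeps the locality-group preallocation an exact multiple of the RAID stripe so that group preallocations never straddle stripe boundaries. A stand-alone illustration of the arithmetic; roundup_u is an invented helper mirroring the usual roundup() semantics, and the 48-block stripe is a made-up example value:

```c
#include <stdio.h>

/* Round x up to the next multiple of y (y > 0). */
static unsigned int roundup_u(unsigned int x, unsigned int y)
{
    return ((x + y - 1) / y) * y;
}

int main(void)
{
    /* With the 512-block default group prealloc and a hypothetical
     * 48-block stripe, the prealloc becomes 528 = 11 whole stripes. */
    printf("%u\n", roundup_u(512, 48));     /* prints 528 */
    return 0;
}
```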
@@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2487 | spin_lock_init(&lg->lg_prealloc_lock); | 2501 | spin_lock_init(&lg->lg_prealloc_lock); |
2488 | } | 2502 | } |
2489 | 2503 | ||
2504 | /* init file for buddy data */ | ||
2505 | ret = ext4_mb_init_backend(sb); | ||
2506 | if (ret != 0) { | ||
2507 | goto out; | ||
2508 | } | ||
2509 | |||
2490 | if (sbi->s_proc) | 2510 | if (sbi->s_proc) |
2491 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, | 2511 | proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, |
2492 | &ext4_mb_seq_groups_fops, sb); | 2512 | &ext4_mb_seq_groups_fops, sb); |
@@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb) | |||
2544 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2564 | EXT4_DESC_PER_BLOCK_BITS(sb); |
2545 | for (i = 0; i < num_meta_group_infos; i++) | 2565 | for (i = 0; i < num_meta_group_infos; i++) |
2546 | kfree(sbi->s_group_info[i]); | 2566 | kfree(sbi->s_group_info[i]); |
2547 | kfree(sbi->s_group_info); | 2567 | ext4_kvfree(sbi->s_group_info); |
2548 | } | 2568 | } |
2549 | kfree(sbi->s_mb_offsets); | 2569 | kfree(sbi->s_mb_offsets); |
2550 | kfree(sbi->s_mb_maxs); | 2570 | kfree(sbi->s_mb_maxs); |
2551 | if (sbi->s_buddy_cache) | 2571 | if (sbi->s_buddy_cache) |
2552 | iput(sbi->s_buddy_cache); | 2572 | iput(sbi->s_buddy_cache); |
2553 | if (sbi->s_mb_stats) { | 2573 | if (sbi->s_mb_stats) { |
2554 | printk(KERN_INFO | 2574 | ext4_msg(sb, KERN_INFO, |
2555 | "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", | 2575 | "mballoc: %u blocks %u reqs (%u success)", |
2556 | atomic_read(&sbi->s_bal_allocated), | 2576 | atomic_read(&sbi->s_bal_allocated), |
2557 | atomic_read(&sbi->s_bal_reqs), | 2577 | atomic_read(&sbi->s_bal_reqs), |
2558 | atomic_read(&sbi->s_bal_success)); | 2578 | atomic_read(&sbi->s_bal_success)); |
2559 | printk(KERN_INFO | 2579 | ext4_msg(sb, KERN_INFO, |
2560 | "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " | 2580 | "mballoc: %u extents scanned, %u goal hits, " |
2561 | "%u 2^N hits, %u breaks, %u lost\n", | 2581 | "%u 2^N hits, %u breaks, %u lost", |
2562 | atomic_read(&sbi->s_bal_ex_scanned), | 2582 | atomic_read(&sbi->s_bal_ex_scanned), |
2563 | atomic_read(&sbi->s_bal_goals), | 2583 | atomic_read(&sbi->s_bal_goals), |
2564 | atomic_read(&sbi->s_bal_2orders), | 2584 | atomic_read(&sbi->s_bal_2orders), |
2565 | atomic_read(&sbi->s_bal_breaks), | 2585 | atomic_read(&sbi->s_bal_breaks), |
2566 | atomic_read(&sbi->s_mb_lost_chunks)); | 2586 | atomic_read(&sbi->s_mb_lost_chunks)); |
2567 | printk(KERN_INFO | 2587 | ext4_msg(sb, KERN_INFO, |
2568 | "EXT4-fs: mballoc: %lu generated and it took %Lu\n", | 2588 | "mballoc: %lu generated and it took %Lu", |
2569 | sbi->s_mb_buddies_generated++, | 2589 | sbi->s_mb_buddies_generated, |
2570 | sbi->s_mb_generation_time); | 2590 | sbi->s_mb_generation_time); |
2571 | printk(KERN_INFO | 2591 | ext4_msg(sb, KERN_INFO, |
2572 | "EXT4-fs: mballoc: %u preallocated, %u discarded\n", | 2592 | "mballoc: %u preallocated, %u discarded", |
2573 | atomic_read(&sbi->s_mb_preallocated), | 2593 | atomic_read(&sbi->s_mb_preallocated), |
2574 | atomic_read(&sbi->s_mb_discarded)); | 2594 | atomic_read(&sbi->s_mb_discarded)); |
2575 | } | 2595 | } |
@@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | |||
2628 | rb_erase(&entry->node, &(db->bb_free_root)); | 2648 | rb_erase(&entry->node, &(db->bb_free_root)); |
2629 | mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); | 2649 | mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); |
2630 | 2650 | ||
2651 | /* | ||
2652 | * Clear the trimmed flag for the group so that the next | ||
2653 | * ext4_trim_fs can trim it. | ||
2654 | * If the volume is mounted with -o discard, online discard | ||
2655 | * is supported and the free blocks will be trimmed online. | ||
2656 | */ | ||
2657 | if (!test_opt(sb, DISCARD)) | ||
2658 | EXT4_MB_GRP_CLEAR_TRIMMED(db); | ||
2659 | |||
2631 | if (!db->bb_free_root.rb_node) { | 2660 | if (!db->bb_free_root.rb_node) { |
2632 | /* No more items in the per group rb tree | 2661 | /* No more items in the per group rb tree |
2633 | * balance refcounts from ext4_mb_free_metadata() | 2662 | * balance refcounts from ext4_mb_free_metadata() |
@@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2771 | * We leak some of the blocks here. | 2800 | * We leak some of the blocks here. |
2772 | */ | 2801 | */ |
2773 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); | 2802 | ext4_lock_group(sb, ac->ac_b_ex.fe_group); |
2774 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, | 2803 | ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
2775 | ac->ac_b_ex.fe_len); | 2804 | ac->ac_b_ex.fe_len); |
2776 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | 2805 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); |
2777 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); | 2806 | err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); |
2778 | if (!err) | 2807 | if (!err) |
@@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2790 | } | 2819 | } |
2791 | } | 2820 | } |
2792 | #endif | 2821 | #endif |
2793 | mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); | 2822 | ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, |
2823 | ac->ac_b_ex.fe_len); | ||
2794 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 2824 | if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
2795 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); | 2825 | gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); |
2796 | ext4_free_blks_set(sb, gdp, | 2826 | ext4_free_blks_set(sb, gdp, |
@@ -2830,8 +2860,9 @@ out_err: | |||
2830 | 2860 | ||
2831 | /* | 2861 | /* |
2832 | * here we normalize request for locality group | 2862 | * here we normalize request for locality group |
2833 | * Group request are normalized to s_strip size if we set the same via mount | 2863 | * Group requests are normalized to s_mb_group_prealloc, which is set to |
2834 | * option. If not we set it to s_mb_group_prealloc which can be configured via | 2864 | * a multiple of s_stripe if the stripe= mount option is given. |
2865 | * s_mb_group_prealloc can be configured via | ||
2835 | * /sys/fs/ext4/<partition>/mb_group_prealloc | 2866 | * /sys/fs/ext4/<partition>/mb_group_prealloc |
2836 | * | 2867 | * |
2837 | * XXX: should we try to preallocate more than the group has now? | 2868 | * XXX: should we try to preallocate more than the group has now? |
@@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) | |||
2842 | struct ext4_locality_group *lg = ac->ac_lg; | 2873 | struct ext4_locality_group *lg = ac->ac_lg; |
2843 | 2874 | ||
2844 | BUG_ON(lg == NULL); | 2875 | BUG_ON(lg == NULL); |
2845 | if (EXT4_SB(sb)->s_stripe) | 2876 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; |
2846 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; | ||
2847 | else | ||
2848 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; | ||
2849 | mb_debug(1, "#%u: goal %u blocks for locality group\n", | 2877 | mb_debug(1, "#%u: goal %u blocks for locality group\n", |
2850 | current->pid, ac->ac_g_ex.fe_len); | 2878 | current->pid, ac->ac_g_ex.fe_len); |
2851 | } | 2879 | } |
@@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
3001 | 3029 | ||
3002 | if (start + size <= ac->ac_o_ex.fe_logical && | 3030 | if (start + size <= ac->ac_o_ex.fe_logical && |
3003 | start > ac->ac_o_ex.fe_logical) { | 3031 | start > ac->ac_o_ex.fe_logical) { |
3004 | printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", | 3032 | ext4_msg(ac->ac_sb, KERN_ERR, |
3005 | (unsigned long) start, (unsigned long) size, | 3033 | "start %lu, size %lu, fe_logical %lu", |
3006 | (unsigned long) ac->ac_o_ex.fe_logical); | 3034 | (unsigned long) start, (unsigned long) size, |
3035 | (unsigned long) ac->ac_o_ex.fe_logical); | ||
3007 | } | 3036 | } |
3008 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && | 3037 | BUG_ON(start + size <= ac->ac_o_ex.fe_logical && |
3009 | start > ac->ac_o_ex.fe_logical); | 3038 | start > ac->ac_o_ex.fe_logical); |
@@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, | |||
3262 | 3291 | ||
3263 | while (n) { | 3292 | while (n) { |
3264 | entry = rb_entry(n, struct ext4_free_data, node); | 3293 | entry = rb_entry(n, struct ext4_free_data, node); |
3265 | mb_set_bits(bitmap, entry->start_blk, entry->count); | 3294 | ext4_set_bits(bitmap, entry->start_blk, entry->count); |
3266 | n = rb_next(n); | 3295 | n = rb_next(n); |
3267 | } | 3296 | } |
3268 | return; | 3297 | return; |
@@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |||
3304 | if (unlikely(len == 0)) | 3333 | if (unlikely(len == 0)) |
3305 | continue; | 3334 | continue; |
3306 | BUG_ON(groupnr != group); | 3335 | BUG_ON(groupnr != group); |
3307 | mb_set_bits(bitmap, start, len); | 3336 | ext4_set_bits(bitmap, start, len); |
3308 | preallocated += len; | 3337 | preallocated += len; |
3309 | count++; | 3338 | count++; |
3310 | } | 3339 | } |
@@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3584 | bit = next + 1; | 3613 | bit = next + 1; |
3585 | } | 3614 | } |
3586 | if (free != pa->pa_free) { | 3615 | if (free != pa->pa_free) { |
3587 | printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", | 3616 | ext4_msg(e4b->bd_sb, KERN_CRIT, |
3588 | pa, (unsigned long) pa->pa_lstart, | 3617 | "pa %p: logic %lu, phys. %lu, len %lu", |
3589 | (unsigned long) pa->pa_pstart, | 3618 | pa, (unsigned long) pa->pa_lstart, |
3590 | (unsigned long) pa->pa_len); | 3619 | (unsigned long) pa->pa_pstart, |
3620 | (unsigned long) pa->pa_len); | ||
3591 | ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", | 3621 | ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", |
3592 | free, pa->pa_free); | 3622 | free, pa->pa_free); |
3593 | /* | 3623 | /* |
@@ -3775,7 +3805,8 @@ repeat: | |||
3775 | * use preallocation while we're discarding it */ | 3805 | * use preallocation while we're discarding it */ |
3776 | spin_unlock(&pa->pa_lock); | 3806 | spin_unlock(&pa->pa_lock); |
3777 | spin_unlock(&ei->i_prealloc_lock); | 3807 | spin_unlock(&ei->i_prealloc_lock); |
3778 | printk(KERN_ERR "uh-oh! used pa while discarding\n"); | 3808 | ext4_msg(sb, KERN_ERR, |
3809 | "uh-oh! used pa while discarding"); | ||
3779 | WARN_ON(1); | 3810 | WARN_ON(1); |
3780 | schedule_timeout_uninterruptible(HZ); | 3811 | schedule_timeout_uninterruptible(HZ); |
3781 | goto repeat; | 3812 | goto repeat; |
@@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3852 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) | 3883 | (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) |
3853 | return; | 3884 | return; |
3854 | 3885 | ||
3855 | printk(KERN_ERR "EXT4-fs: Can't allocate:" | 3886 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:" |
3856 | " Allocation context details:\n"); | 3887 | " Allocation context details:"); |
3857 | printk(KERN_ERR "EXT4-fs: status %d flags %d\n", | 3888 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d", |
3858 | ac->ac_status, ac->ac_flags); | 3889 | ac->ac_status, ac->ac_flags); |
3859 | printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " | 3890 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, " |
3860 | "best %lu/%lu/%lu@%lu cr %d\n", | 3891 | "goal %lu/%lu/%lu@%lu, " |
3892 | "best %lu/%lu/%lu@%lu cr %d", | ||
3861 | (unsigned long)ac->ac_o_ex.fe_group, | 3893 | (unsigned long)ac->ac_o_ex.fe_group, |
3862 | (unsigned long)ac->ac_o_ex.fe_start, | 3894 | (unsigned long)ac->ac_o_ex.fe_start, |
3863 | (unsigned long)ac->ac_o_ex.fe_len, | 3895 | (unsigned long)ac->ac_o_ex.fe_len, |
@@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
3871 | (unsigned long)ac->ac_b_ex.fe_len, | 3903 | (unsigned long)ac->ac_b_ex.fe_len, |
3872 | (unsigned long)ac->ac_b_ex.fe_logical, | 3904 | (unsigned long)ac->ac_b_ex.fe_logical, |
3873 | (int)ac->ac_criteria); | 3905 | (int)ac->ac_criteria); |
3874 | printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, | 3906 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found", |
3875 | ac->ac_found); | 3907 | ac->ac_ex_scanned, ac->ac_found); |
3876 | printk(KERN_ERR "EXT4-fs: groups: \n"); | 3908 | ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: "); |
3877 | ngroups = ext4_get_groups_count(sb); | 3909 | ngroups = ext4_get_groups_count(sb); |
3878 | for (i = 0; i < ngroups; i++) { | 3910 | for (i = 0; i < ngroups; i++) { |
3879 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); | 3911 | struct ext4_group_info *grp = ext4_get_group_info(sb, i); |
@@ -4637,7 +4669,7 @@ do_more: | |||
4637 | } | 4669 | } |
4638 | ext4_mark_super_dirty(sb); | 4670 | ext4_mark_super_dirty(sb); |
4639 | error_return: | 4671 | error_return: |
4640 | if (freed) | 4672 | if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) |
4641 | dquot_free_block(inode, freed); | 4673 | dquot_free_block(inode, freed); |
4642 | brelse(bitmap_bh); | 4674 | brelse(bitmap_bh); |
4643 | ext4_std_error(sb, err); | 4675 | ext4_std_error(sb, err); |
@@ -4645,7 +4677,7 @@ error_return: | |||
4645 | } | 4677 | } |
4646 | 4678 | ||
4647 | /** | 4679 | /** |
4648 | * ext4_add_groupblocks() -- Add given blocks to an existing group | 4680 | * ext4_group_add_blocks() -- Add given blocks to an existing group |
4649 | * @handle: handle to this transaction | 4681 | * @handle: handle to this transaction |
4650 | * @sb: super block | 4682 | * @sb: super block |
4651 | * @block: start physical block to add to the block group | 4683 | * @block: start physical block to add to the block group |
@@ -4653,7 +4685,7 @@ error_return: | |||
4653 | * | 4685 | * |
4654 | * This marks the blocks as free in the bitmap and buddy. | 4686 | * This marks the blocks as free in the bitmap and buddy. |
4655 | */ | 4687 | */ |
4656 | void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | 4688 | int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, |
4657 | ext4_fsblk_t block, unsigned long count) | 4689 | ext4_fsblk_t block, unsigned long count) |
4658 | { | 4690 | { |
4659 | struct buffer_head *bitmap_bh = NULL; | 4691 | struct buffer_head *bitmap_bh = NULL; |
@@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4666 | struct ext4_buddy e4b; | 4698 | struct ext4_buddy e4b; |
4667 | int err = 0, ret, blk_free_count; | 4699 | int err = 0, ret, blk_free_count; |
4668 | ext4_grpblk_t blocks_freed; | 4700 | ext4_grpblk_t blocks_freed; |
4669 | struct ext4_group_info *grp; | ||
4670 | 4701 | ||
4671 | ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); | 4702 | ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); |
4672 | 4703 | ||
4704 | if (count == 0) | ||
4705 | return 0; | ||
4706 | |||
4673 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); | 4707 | ext4_get_group_no_and_offset(sb, block, &block_group, &bit); |
4674 | grp = ext4_get_group_info(sb, block_group); | ||
4675 | /* | 4708 | /* |
4676 | * Check to see if we are freeing blocks across a group | 4709 | * Check to see if we are freeing blocks across a group |
4677 | * boundary. | 4710 | * boundary. |
4678 | */ | 4711 | */ |
4679 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) | 4712 | if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { |
4713 | ext4_warning(sb, "too many blocks added to group %u\n", | ||
4714 | block_group); | ||
4715 | err = -EINVAL; | ||
4680 | goto error_return; | 4716 | goto error_return; |
4717 | } | ||
4681 | 4718 | ||
4682 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); | 4719 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
4683 | if (!bitmap_bh) | 4720 | if (!bitmap_bh) { |
4721 | err = -EIO; | ||
4684 | goto error_return; | 4722 | goto error_return; |
4723 | } | ||
4724 | |||
4685 | desc = ext4_get_group_desc(sb, block_group, &gd_bh); | 4725 | desc = ext4_get_group_desc(sb, block_group, &gd_bh); |
4686 | if (!desc) | 4726 | if (!desc) { |
4727 | err = -EIO; | ||
4687 | goto error_return; | 4728 | goto error_return; |
4729 | } | ||
4688 | 4730 | ||
4689 | if (in_range(ext4_block_bitmap(sb, desc), block, count) || | 4731 | if (in_range(ext4_block_bitmap(sb, desc), block, count) || |
4690 | in_range(ext4_inode_bitmap(sb, desc), block, count) || | 4732 | in_range(ext4_inode_bitmap(sb, desc), block, count) || |
@@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4694 | ext4_error(sb, "Adding blocks in system zones - " | 4736 | ext4_error(sb, "Adding blocks in system zones - " |
4695 | "Block = %llu, count = %lu", | 4737 | "Block = %llu, count = %lu", |
4696 | block, count); | 4738 | block, count); |
4739 | err = -EINVAL; | ||
4697 | goto error_return; | 4740 | goto error_return; |
4698 | } | 4741 | } |
4699 | 4742 | ||
@@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
4762 | error_return: | 4805 | error_return: |
4763 | brelse(bitmap_bh); | 4806 | brelse(bitmap_bh); |
4764 | ext4_std_error(sb, err); | 4807 | ext4_std_error(sb, err); |
4765 | return; | 4808 | return err; |
4766 | } | 4809 | } |
4767 | 4810 | ||
4768 | /** | 4811 | /** |
@@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, | |||
4782 | { | 4825 | { |
4783 | struct ext4_free_extent ex; | 4826 | struct ext4_free_extent ex; |
4784 | 4827 | ||
4828 | trace_ext4_trim_extent(sb, group, start, count); | ||
4829 | |||
4785 | assert_spin_locked(ext4_group_lock_ptr(sb, group)); | 4830 | assert_spin_locked(ext4_group_lock_ptr(sb, group)); |
4786 | 4831 | ||
4787 | ex.fe_start = start; | 4832 | ex.fe_start = start; |
@@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count, | |||
4802 | /** | 4847 | /** |
4803 | * ext4_trim_all_free -- function to trim all free space in alloc. group | 4848 | * ext4_trim_all_free -- function to trim all free space in alloc. group |
4804 | * @sb: super block for file system | 4849 | * @sb: super block for file system |
4805 | * @e4b: ext4 buddy | 4850 | * @group: group to be trimmed |
4806 | * @start: first group block to examine | 4851 | * @start: first group block to examine |
4807 | * @max: last group block to examine | 4852 | * @max: last group block to examine |
4808 | * @minblocks: minimum extent block count | 4853 | * @minblocks: minimum extent block count |
@@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4823 | ext4_grpblk_t minblocks) | 4868 | ext4_grpblk_t minblocks) |
4824 | { | 4869 | { |
4825 | void *bitmap; | 4870 | void *bitmap; |
4826 | ext4_grpblk_t next, count = 0; | 4871 | ext4_grpblk_t next, count = 0, free_count = 0; |
4827 | struct ext4_buddy e4b; | 4872 | struct ext4_buddy e4b; |
4828 | int ret; | 4873 | int ret; |
4829 | 4874 | ||
4875 | trace_ext4_trim_all_free(sb, group, start, max); | ||
4876 | |||
4830 | ret = ext4_mb_load_buddy(sb, group, &e4b); | 4877 | ret = ext4_mb_load_buddy(sb, group, &e4b); |
4831 | if (ret) { | 4878 | if (ret) { |
4832 | ext4_error(sb, "Error in loading buddy " | 4879 | ext4_error(sb, "Error in loading buddy " |
@@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4836 | bitmap = e4b.bd_bitmap; | 4883 | bitmap = e4b.bd_bitmap; |
4837 | 4884 | ||
4838 | ext4_lock_group(sb, group); | 4885 | ext4_lock_group(sb, group); |
4886 | if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && | ||
4887 | minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) | ||
4888 | goto out; | ||
4889 | |||
4839 | start = (e4b.bd_info->bb_first_free > start) ? | 4890 | start = (e4b.bd_info->bb_first_free > start) ? |
4840 | e4b.bd_info->bb_first_free : start; | 4891 | e4b.bd_info->bb_first_free : start; |
4841 | 4892 | ||
@@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4850 | next - start, group, &e4b); | 4901 | next - start, group, &e4b); |
4851 | count += next - start; | 4902 | count += next - start; |
4852 | } | 4903 | } |
4904 | free_count += next - start; | ||
4853 | start = next + 1; | 4905 | start = next + 1; |
4854 | 4906 | ||
4855 | if (fatal_signal_pending(current)) { | 4907 | if (fatal_signal_pending(current)) { |
@@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, | |||
4863 | ext4_lock_group(sb, group); | 4915 | ext4_lock_group(sb, group); |
4864 | } | 4916 | } |
4865 | 4917 | ||
4866 | if ((e4b.bd_info->bb_free - count) < minblocks) | 4918 | if ((e4b.bd_info->bb_free - free_count) < minblocks) |
4867 | break; | 4919 | break; |
4868 | } | 4920 | } |
4921 | |||
4922 | if (!ret) | ||
4923 | EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); | ||
4924 | out: | ||
4869 | ext4_unlock_group(sb, group); | 4925 | ext4_unlock_group(sb, group); |
4870 | ext4_mb_unload_buddy(&e4b); | 4926 | ext4_mb_unload_buddy(&e4b); |
4871 | 4927 | ||
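The WAS_TRIMMED check above caches trim state per group: a group already trimmed at an equal or smaller minimum extent length is skipped, and the release_blocks_on_commit() hunk earlier clears the flag again whenever blocks are freed without online discard. A toy model of the decision, with invented types rather than the kernel's bit operations:

```c
#include <stdbool.h>

struct toy_group {
    bool was_trimmed;          /* stands in for EXT4_MB_GRP_WAS_TRIMMED */
};

static bool worth_trimming(const struct toy_group *grp,
                           unsigned int minblocks,
                           unsigned int last_trim_minblks)
{
    /* Skip only when already trimmed at an equal or smaller minimum
     * length; a stricter (smaller) minlen makes re-trimming useful. */
    return !grp->was_trimmed || minblocks < last_trim_minblks;
}

int main(void)
{
    struct toy_group g = { .was_trimmed = true };

    return worth_trimming(&g, 8, 16) ? 0 : 1;   /* 8 < 16: re-trim */
}
```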
@@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
4904 | 4960 | ||
4905 | if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) | 4961 | if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) |
4906 | return -EINVAL; | 4962 | return -EINVAL; |
4963 | if (start + len <= first_data_blk) | ||
4964 | goto out; | ||
4907 | if (start < first_data_blk) { | 4965 | if (start < first_data_blk) { |
4908 | len -= first_data_blk - start; | 4966 | len -= first_data_blk - start; |
4909 | start = first_data_blk; | 4967 | start = first_data_blk; |
@@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
4952 | } | 5010 | } |
4953 | range->len = trimmed * sb->s_blocksize; | 5011 | range->len = trimmed * sb->s_blocksize; |
4954 | 5012 | ||
5013 | if (!ret) | ||
5014 | atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen); | ||
5015 | |||
5016 | out: | ||
4955 | return ret; | 5017 | return ret; |
4956 | } | 5018 | } |
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 20b5e7bfebd1..9d4a636b546c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -187,7 +187,6 @@ struct ext4_allocation_context { | |||
187 | __u16 ac_flags; /* allocation hints */ | 187 | __u16 ac_flags; /* allocation hints */ |
188 | __u8 ac_status; | 188 | __u8 ac_status; |
189 | __u8 ac_criteria; | 189 | __u8 ac_criteria; |
190 | __u8 ac_repeats; | ||
191 | __u8 ac_2order; /* if request is to allocate 2^N blocks and | 190 | __u8 ac_2order; /* if request is to allocate 2^N blocks and |
192 | * N > 0, the field stores N, otherwise 0 */ | 191 | * N > 0, the field stores N, otherwise 0 */ |
193 | __u8 ac_op; /* operation, for history only */ | 192 | __u8 ac_op; /* operation, for history only */ |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8c9babac43dc..565a154e22d4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent | |||
289 | while (len--) printk("%c", *name++); | 289 | while (len--) printk("%c", *name++); |
290 | ext4fs_dirhash(de->name, de->name_len, &h); | 290 | ext4fs_dirhash(de->name, de->name_len, &h); |
291 | printk(":%x.%u ", h.hash, | 291 | printk(":%x.%u ", h.hash, |
292 | ((char *) de - base)); | 292 | (unsigned) ((char *) de - base)); |
293 | } | 293 | } |
294 | space += EXT4_DIR_REC_LEN(de->name_len); | 294 | space += EXT4_DIR_REC_LEN(de->name_len); |
295 | names++; | 295 | names++; |
@@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q | |||
1013 | 1013 | ||
1014 | *err = -ENOENT; | 1014 | *err = -ENOENT; |
1015 | errout: | 1015 | errout: |
1016 | dxtrace(printk(KERN_DEBUG "%s not found\n", name)); | 1016 | dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); |
1017 | dx_release (frames); | 1017 | dx_release (frames); |
1018 | return NULL; | 1018 | return NULL; |
1019 | } | 1019 | } |
@@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) | |||
1985 | if (!list_empty(&EXT4_I(inode)->i_orphan)) | 1985 | if (!list_empty(&EXT4_I(inode)->i_orphan)) |
1986 | goto out_unlock; | 1986 | goto out_unlock; |
1987 | 1987 | ||
1988 | /* Orphan handling is only valid for files with data blocks | 1988 | /* |
1989 | * being truncated, or files being unlinked. */ | 1989 | * Orphan handling is only valid for files with data blocks |
1990 | 1990 | * being truncated, or files being unlinked. Note that we either | |
1991 | /* @@@ FIXME: Observation from aviro: | 1991 | * hold i_mutex, or the inode can not be referenced from outside, |
1992 | * I think I can trigger J_ASSERT in ext4_orphan_add(). We block | 1992 | * so i_nlink should not be bumped due to race |
1993 | * here (on s_orphan_lock), so race with ext4_link() which might bump | ||
1994 | * ->i_nlink. For, say it, character device. Not a regular file, | ||
1995 | * not a directory, not a symlink and ->i_nlink > 0. | ||
1996 | * | ||
1997 | * tytso, 4/25/2009: I'm not sure how that could happen; | ||
1998 | * shouldn't the fs core protect us from these sort of | ||
1999 | * unlink()/link() races? | ||
2000 | */ | 1993 | */ |
2001 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 1994 | J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || |
2002 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); | 1995 | S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7bb8f76d470a..430c401d0895 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -285,11 +285,7 @@ static int io_submit_init(struct ext4_io_submit *io, | |||
285 | io_end = ext4_init_io_end(inode, GFP_NOFS); | 285 | io_end = ext4_init_io_end(inode, GFP_NOFS); |
286 | if (!io_end) | 286 | if (!io_end) |
287 | return -ENOMEM; | 287 | return -ENOMEM; |
288 | do { | 288 | bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); |
289 | bio = bio_alloc(GFP_NOIO, nvecs); | ||
290 | nvecs >>= 1; | ||
291 | } while (bio == NULL); | ||
292 | |||
293 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); | 289 | bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); |
294 | bio->bi_bdev = bh->b_bdev; | 290 | bio->bi_bdev = bh->b_bdev; |
295 | bio->bi_private = io->io_end = io_end; | 291 | bio->bi_private = io->io_end = io_end; |
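The removed loop halved nvecs on failure because a bio cannot carry more than BIO_MAX_PAGES vectors; clamping the request up front reaches the same allocation in a single bio_alloc() call. An illustrative clamp, assuming plain min() semantics rather than the kernel macro:

```c
#include <stdio.h>

static unsigned int clamp_nvecs(unsigned int nvecs, unsigned int max_pages)
{
    return nvecs < max_pages ? nvecs : max_pages;
}

int main(void)
{
    printf("%u\n", clamp_nvecs(512, 256));      /* prints 256 */
    return 0;
}
```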
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 80bbc9c60c24..707d3f16f7ce 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -16,6 +16,35 @@ | |||
16 | 16 | ||
17 | #include "ext4_jbd2.h" | 17 | #include "ext4_jbd2.h" |
18 | 18 | ||
19 | int ext4_resize_begin(struct super_block *sb) | ||
20 | { | ||
21 | int ret = 0; | ||
22 | |||
23 | if (!capable(CAP_SYS_RESOURCE)) | ||
24 | return -EPERM; | ||
25 | |||
26 | /* | ||
27 | * We are not allowed to do online-resizing on a filesystem mounted | ||
28 | * with errors, because it can destroy the filesystem easily. | ||
29 | */ | ||
30 | if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { | ||
31 | ext4_warning(sb, "There are errors in the filesystem, " | ||
32 | "so online resizing is not allowed\n"); | ||
33 | return -EPERM; | ||
34 | } | ||
35 | |||
36 | if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags)) | ||
37 | ret = -EBUSY; | ||
38 | |||
39 | return ret; | ||
40 | } | ||
41 | |||
42 | void ext4_resize_end(struct super_block *sb) | ||
43 | { | ||
44 | clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags); | ||
45 | smp_mb__after_clear_bit(); | ||
46 | } | ||
47 | |||
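ext4_resize_begin() and ext4_resize_end() above form a try-lock gate around online resizing: test_and_set_bit_lock() either acquires the EXT4_RESIZING bit or leads to -EBUSY, and clear_bit_unlock() releases it with the required memory ordering. A rough user-space analogue using C11 atomics, with invented names; acquire and release correspond to the lock/unlock bit operations:

```c
#include <stdatomic.h>
#include <stdbool.h>

static atomic_flag resizing = ATOMIC_FLAG_INIT;

/* Mirrors ext4_resize_begin(): false means a resize already runs. */
static bool resize_begin(void)
{
    return !atomic_flag_test_and_set_explicit(&resizing,
                                              memory_order_acquire);
}

/* Mirrors ext4_resize_end(): release ordering publishes our updates. */
static void resize_end(void)
{
    atomic_flag_clear_explicit(&resizing, memory_order_release);
}

int main(void)
{
    if (resize_begin()) {
        /* ... perform the resize ... */
        resize_end();
    }
    return 0;
}
```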
19 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) | 48 | #define outside(b, first, last) ((b) < (first) || (b) >= (last)) |
20 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) | 49 | #define inside(b, first, last) ((b) >= (first) && (b) < (last)) |
21 | 50 | ||
@@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, | |||
118 | brelse(bh); | 147 | brelse(bh); |
119 | bh = ERR_PTR(err); | 148 | bh = ERR_PTR(err); |
120 | } else { | 149 | } else { |
121 | lock_buffer(bh); | ||
122 | memset(bh->b_data, 0, sb->s_blocksize); | 150 | memset(bh->b_data, 0, sb->s_blocksize); |
123 | set_buffer_uptodate(bh); | 151 | set_buffer_uptodate(bh); |
124 | unlock_buffer(bh); | ||
125 | } | 152 | } |
126 | 153 | ||
127 | return bh; | 154 | return bh; |
@@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, | |||
132 | * If that fails, restart the transaction & regain write access for the | 159 | * If that fails, restart the transaction & regain write access for the |
133 | * buffer head which is used for block_bitmap modifications. | 160 | * buffer head which is used for block_bitmap modifications. |
134 | */ | 161 | */ |
135 | static int extend_or_restart_transaction(handle_t *handle, int thresh, | 162 | static int extend_or_restart_transaction(handle_t *handle, int thresh) |
136 | struct buffer_head *bh) | ||
137 | { | 163 | { |
138 | int err; | 164 | int err; |
139 | 165 | ||
@@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, | |||
144 | if (err < 0) | 170 | if (err < 0) |
145 | return err; | 171 | return err; |
146 | if (err) { | 172 | if (err) { |
147 | if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) | 173 | err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA); |
148 | return err; | 174 | if (err) |
149 | if ((err = ext4_journal_get_write_access(handle, bh))) | ||
150 | return err; | 175 | return err; |
151 | } | 176 | } |
152 | 177 | ||
@@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
181 | if (IS_ERR(handle)) | 206 | if (IS_ERR(handle)) |
182 | return PTR_ERR(handle); | 207 | return PTR_ERR(handle); |
183 | 208 | ||
184 | mutex_lock(&sbi->s_resize_lock); | 209 | BUG_ON(input->group != sbi->s_groups_count); |
185 | if (input->group != sbi->s_groups_count) { | ||
186 | err = -EBUSY; | ||
187 | goto exit_journal; | ||
188 | } | ||
189 | |||
190 | if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { | ||
191 | err = PTR_ERR(bh); | ||
192 | goto exit_journal; | ||
193 | } | ||
194 | |||
195 | if (ext4_bg_has_super(sb, input->group)) { | ||
196 | ext4_debug("mark backup superblock %#04llx (+0)\n", start); | ||
197 | ext4_set_bit(0, bh->b_data); | ||
198 | } | ||
199 | 210 | ||
200 | /* Copy all of the GDT blocks into the backup in this group */ | 211 | /* Copy all of the GDT blocks into the backup in this group */ |
201 | for (i = 0, bit = 1, block = start + 1; | 212 | for (i = 0, bit = 1, block = start + 1; |
@@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
203 | struct buffer_head *gdb; | 214 | struct buffer_head *gdb; |
204 | 215 | ||
205 | ext4_debug("update backup group %#04llx (+%d)\n", block, bit); | 216 | ext4_debug("update backup group %#04llx (+%d)\n", block, bit); |
206 | 217 | err = extend_or_restart_transaction(handle, 1); | |
207 | if ((err = extend_or_restart_transaction(handle, 1, bh))) | 218 | if (err) |
208 | goto exit_bh; | 219 | goto exit_journal; |
209 | 220 | ||
210 | gdb = sb_getblk(sb, block); | 221 | gdb = sb_getblk(sb, block); |
211 | if (!gdb) { | 222 | if (!gdb) { |
212 | err = -EIO; | 223 | err = -EIO; |
213 | goto exit_bh; | 224 | goto exit_journal; |
214 | } | 225 | } |
215 | if ((err = ext4_journal_get_write_access(handle, gdb))) { | 226 | if ((err = ext4_journal_get_write_access(handle, gdb))) { |
216 | brelse(gdb); | 227 | brelse(gdb); |
217 | goto exit_bh; | 228 | goto exit_journal; |
218 | } | 229 | } |
219 | lock_buffer(gdb); | ||
220 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); | 230 | memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); |
221 | set_buffer_uptodate(gdb); | 231 | set_buffer_uptodate(gdb); |
222 | unlock_buffer(gdb); | ||
223 | err = ext4_handle_dirty_metadata(handle, NULL, gdb); | 232 | err = ext4_handle_dirty_metadata(handle, NULL, gdb); |
224 | if (unlikely(err)) { | 233 | if (unlikely(err)) { |
225 | brelse(gdb); | 234 | brelse(gdb); |
226 | goto exit_bh; | 235 | goto exit_journal; |
227 | } | 236 | } |
228 | ext4_set_bit(bit, bh->b_data); | ||
229 | brelse(gdb); | 237 | brelse(gdb); |
230 | } | 238 | } |
231 | 239 | ||
@@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
235 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, | 243 | err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, |
236 | GFP_NOFS); | 244 | GFP_NOFS); |
237 | if (err) | 245 | if (err) |
238 | goto exit_bh; | 246 | goto exit_journal; |
239 | for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) | 247 | |
240 | ext4_set_bit(bit, bh->b_data); | 248 | err = extend_or_restart_transaction(handle, 2); |
249 | if (err) | ||
250 | goto exit_journal; | ||
251 | |||
252 | bh = bclean(handle, sb, input->block_bitmap); | ||
253 | if (IS_ERR(bh)) { | ||
254 | err = PTR_ERR(bh); | ||
255 | goto exit_journal; | ||
256 | } | ||
257 | |||
258 | if (ext4_bg_has_super(sb, input->group)) { | ||
259 | ext4_debug("mark backup group tables %#04llx (+0)\n", start); | ||
260 | ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); | ||
261 | } | ||
241 | 262 | ||
242 | ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, | 263 | ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, |
243 | input->block_bitmap - start); | 264 | input->block_bitmap - start); |
@@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb, | |||
253 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); | 274 | err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); |
254 | if (err) | 275 | if (err) |
255 | goto exit_bh; | 276 | goto exit_bh; |
256 | for (i = 0, bit = input->inode_table - start; | 277 | ext4_set_bits(bh->b_data, input->inode_table - start, |
257 | i < sbi->s_itb_per_group; i++, bit++) | 278 | sbi->s_itb_per_group); |
258 | ext4_set_bit(bit, bh->b_data); | ||
259 | 279 | ||
260 | if ((err = extend_or_restart_transaction(handle, 2, bh))) | ||
261 | goto exit_bh; | ||
262 | 280 | ||
263 | ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, | 281 | ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, |
264 | bh->b_data); | 282 | bh->b_data); |
@@ -285,7 +303,6 @@ exit_bh: | |||
285 | brelse(bh); | 303 | brelse(bh); |
286 | 304 | ||
287 | exit_journal: | 305 | exit_journal: |
288 | mutex_unlock(&sbi->s_resize_lock); | ||
289 | if ((err2 = ext4_journal_stop(handle)) && !err) | 306 | if ((err2 = ext4_journal_stop(handle)) && !err) |
290 | err = err2; | 307 | err = err2; |
291 | 308 | ||
@@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb, | |||
377 | * fail once we start modifying the data on disk, because JBD has no rollback. | 394 | * fail once we start modifying the data on disk, because JBD has no rollback. |
378 | */ | 395 | */ |
379 | static int add_new_gdb(handle_t *handle, struct inode *inode, | 396 | static int add_new_gdb(handle_t *handle, struct inode *inode, |
380 | struct ext4_new_group_data *input, | 397 | ext4_group_t group) |
381 | struct buffer_head **primary) | ||
382 | { | 398 | { |
383 | struct super_block *sb = inode->i_sb; | 399 | struct super_block *sb = inode->i_sb; |
384 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; | 400 | struct ext4_super_block *es = EXT4_SB(sb)->s_es; |
385 | unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | 401 | unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); |
386 | ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; | 402 | ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; |
387 | struct buffer_head **o_group_desc, **n_group_desc; | 403 | struct buffer_head **o_group_desc, **n_group_desc; |
388 | struct buffer_head *dind; | 404 | struct buffer_head *dind; |
405 | struct buffer_head *gdb_bh; | ||
389 | int gdbackups; | 406 | int gdbackups; |
390 | struct ext4_iloc iloc; | 407 | struct ext4_iloc iloc; |
391 | __le32 *data; | 408 | __le32 *data; |
@@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
408 | return -EPERM; | 425 | return -EPERM; |
409 | } | 426 | } |
410 | 427 | ||
411 | *primary = sb_bread(sb, gdblock); | 428 | gdb_bh = sb_bread(sb, gdblock); |
412 | if (!*primary) | 429 | if (!gdb_bh) |
413 | return -EIO; | 430 | return -EIO; |
414 | 431 | ||
415 | if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { | 432 | gdbackups = verify_reserved_gdb(sb, gdb_bh); |
433 | if (gdbackups < 0) { | ||
416 | err = gdbackups; | 434 | err = gdbackups; |
417 | goto exit_bh; | 435 | goto exit_bh; |
418 | } | 436 | } |
@@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
427 | data = (__le32 *)dind->b_data; | 445 | data = (__le32 *)dind->b_data; |
428 | if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { | 446 | if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { |
429 | ext4_warning(sb, "new group %u GDT block %llu not reserved", | 447 | ext4_warning(sb, "new group %u GDT block %llu not reserved", |
430 | input->group, gdblock); | 448 | group, gdblock); |
431 | err = -EINVAL; | 449 | err = -EINVAL; |
432 | goto exit_dind; | 450 | goto exit_dind; |
433 | } | 451 | } |
@@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
436 | if (unlikely(err)) | 454 | if (unlikely(err)) |
437 | goto exit_dind; | 455 | goto exit_dind; |
438 | 456 | ||
439 | err = ext4_journal_get_write_access(handle, *primary); | 457 | err = ext4_journal_get_write_access(handle, gdb_bh); |
440 | if (unlikely(err)) | 458 | if (unlikely(err)) |
441 | goto exit_sbh; | 459 | goto exit_sbh; |
442 | 460 | ||
@@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
449 | if (unlikely(err)) | 467 | if (unlikely(err)) |
450 | goto exit_dindj; | 468 | goto exit_dindj; |
451 | 469 | ||
452 | n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), | 470 | n_group_desc = ext4_kvmalloc((gdb_num + 1) * |
453 | GFP_NOFS); | 471 | sizeof(struct buffer_head *), |
472 | GFP_NOFS); | ||
454 | if (!n_group_desc) { | 473 | if (!n_group_desc) { |
455 | err = -ENOMEM; | 474 | err = -ENOMEM; |
456 | ext4_warning(sb, | 475 | ext4_warning(sb, "not enough memory for %lu groups", |
457 | "not enough memory for %lu groups", gdb_num + 1); | 476 | gdb_num + 1); |
458 | goto exit_inode; | 477 | goto exit_inode; |
459 | } | 478 | } |
460 | 479 | ||
@@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
475 | } | 494 | } |
476 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; | 495 | inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; |
477 | ext4_mark_iloc_dirty(handle, inode, &iloc); | 496 | ext4_mark_iloc_dirty(handle, inode, &iloc); |
478 | memset((*primary)->b_data, 0, sb->s_blocksize); | 497 | memset(gdb_bh->b_data, 0, sb->s_blocksize); |
479 | err = ext4_handle_dirty_metadata(handle, NULL, *primary); | 498 | err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh); |
480 | if (unlikely(err)) { | 499 | if (unlikely(err)) { |
481 | ext4_std_error(sb, err); | 500 | ext4_std_error(sb, err); |
482 | goto exit_inode; | 501 | goto exit_inode; |
@@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
486 | o_group_desc = EXT4_SB(sb)->s_group_desc; | 505 | o_group_desc = EXT4_SB(sb)->s_group_desc; |
487 | memcpy(n_group_desc, o_group_desc, | 506 | memcpy(n_group_desc, o_group_desc, |
488 | EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); | 507 | EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); |
489 | n_group_desc[gdb_num] = *primary; | 508 | n_group_desc[gdb_num] = gdb_bh; |
490 | EXT4_SB(sb)->s_group_desc = n_group_desc; | 509 | EXT4_SB(sb)->s_group_desc = n_group_desc; |
491 | EXT4_SB(sb)->s_gdb_count++; | 510 | EXT4_SB(sb)->s_gdb_count++; |
492 | kfree(o_group_desc); | 511 | ext4_kvfree(o_group_desc); |
493 | 512 | ||
494 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); | 513 | le16_add_cpu(&es->s_reserved_gdt_blocks, -1); |
495 | err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); | 514 | err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); |
@@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |||
499 | return err; | 518 | return err; |
500 | 519 | ||
501 | exit_inode: | 520 | exit_inode: |
521 | ext4_kvfree(n_group_desc); | ||
502 | /* ext4_handle_release_buffer(handle, iloc.bh); */ | 522 | /* ext4_handle_release_buffer(handle, iloc.bh); */ |
503 | brelse(iloc.bh); | 523 | brelse(iloc.bh); |
504 | exit_dindj: | 524 | exit_dindj: |
@@ -508,7 +528,7 @@ exit_sbh: | |||
508 | exit_dind: | 528 | exit_dind: |
509 | brelse(dind); | 529 | brelse(dind); |
510 | exit_bh: | 530 | exit_bh: |
511 | brelse(*primary); | 531 | brelse(gdb_bh); |
512 | 532 | ||
513 | ext4_debug("leaving with error %d\n", err); | 533 | ext4_debug("leaving with error %d\n", err); |
514 | return err; | 534 | return err; |
@@ -528,7 +548,7 @@ exit_bh: | |||
528 | * backup GDT blocks are stored in their reserved primary GDT block. | 548 | * backup GDT blocks are stored in their reserved primary GDT block. |
529 | */ | 549 | */ |
530 | static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | 550 | static int reserve_backup_gdb(handle_t *handle, struct inode *inode, |
531 | struct ext4_new_group_data *input) | 551 | ext4_group_t group) |
532 | { | 552 | { |
533 | struct super_block *sb = inode->i_sb; | 553 | struct super_block *sb = inode->i_sb; |
534 | int reserved_gdb = le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); | 554 | int reserved_gdb = le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); |
@@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, | |||
599 | * Finally we can add each of the reserved backup GDT blocks from | 619 | * Finally we can add each of the reserved backup GDT blocks from |
600 | * the new group to its reserved primary GDT block. | 620 | * the new group to its reserved primary GDT block. |
601 | */ | 621 | */ |
602 | blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); | 622 | blk = group * EXT4_BLOCKS_PER_GROUP(sb); |
603 | for (i = 0; i < reserved_gdb; i++) { | 623 | for (i = 0; i < reserved_gdb; i++) { |
604 | int err2; | 624 | int err2; |
605 | data = (__le32 *)primary[i]->b_data; | 625 | data = (__le32 *)primary[i]->b_data; |
@@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
799 | goto exit_put; | 819 | goto exit_put; |
800 | } | 820 | } |
801 | 821 | ||
802 | mutex_lock(&sbi->s_resize_lock); | ||
803 | if (input->group != sbi->s_groups_count) { | ||
804 | ext4_warning(sb, "multiple resizers run on filesystem!"); | ||
805 | err = -EBUSY; | ||
806 | goto exit_journal; | ||
807 | } | ||
808 | |||
809 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) | 822 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) |
810 | goto exit_journal; | 823 | goto exit_journal; |
811 | 824 | ||
@@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
820 | if ((err = ext4_journal_get_write_access(handle, primary))) | 833 | if ((err = ext4_journal_get_write_access(handle, primary))) |
821 | goto exit_journal; | 834 | goto exit_journal; |
822 | 835 | ||
823 | if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && | 836 | if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) { |
824 | (err = reserve_backup_gdb(handle, inode, input))) | 837 | err = reserve_backup_gdb(handle, inode, input->group); |
838 | if (err) | ||
839 | goto exit_journal; | ||
840 | } | ||
841 | } else { | ||
842 | /* | ||
843 | * Note that we can access new group descriptor block safely | ||
844 | * only if add_new_gdb() succeeds. | ||
845 | */ | ||
846 | err = add_new_gdb(handle, inode, input->group); | ||
847 | if (err) | ||
825 | goto exit_journal; | 848 | goto exit_journal; |
826 | } else if ((err = add_new_gdb(handle, inode, input, &primary))) | 849 | primary = sbi->s_group_desc[gdb_num]; |
827 | goto exit_journal; | 850 | } |
828 | 851 | ||
829 | /* | 852 | /* |
830 | * OK, now we've set up the new group. Time to make it active. | 853 | * OK, now we've set up the new group. Time to make it active. |
831 | * | 854 | * |
832 | * We do not lock all allocations via s_resize_lock | ||
833 | * so we have to be safe wrt. concurrent accesses the group | 855 | * so we have to be safe wrt. concurrent accesses the group |
834 | * data. So we need to be careful to set all of the relevant | 856 | * data. So we need to be careful to set all of the relevant |
835 | * group descriptor data etc. *before* we enable the group. | 857 | * group descriptor data etc. *before* we enable the group. |
@@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
886 | * | 908 | * |
887 | * The precise rules we use are: | 909 | * The precise rules we use are: |
888 | * | 910 | * |
889 | * * Writers of s_groups_count *must* hold s_resize_lock | ||
890 | * AND | ||
891 | * * Writers must perform a smp_wmb() after updating all dependent | 911 | * * Writers must perform a smp_wmb() after updating all dependent |
892 | * data and before modifying the groups count | 912 | * data and before modifying the groups count |
893 | * | 913 | * |
894 | * * Readers must hold s_resize_lock over the access | ||
895 | * OR | ||
896 | * * Readers must perform an smp_rmb() after reading the groups count | 914 | * * Readers must perform an smp_rmb() after reading the groups count |
897 | * and before reading any dependent data. | 915 | * and before reading any dependent data. |
898 | * | 916 | * |
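The two rules that survive describe a lock-free publication protocol. Paired sketches of both sides (the helper names are illustrative, not kernel source):

    /* writer (resize path): publish data before the count */
    setup_new_group(sbi, group);        /* descriptors, bitmaps, ... */
    smp_wmb();
    sbi->s_groups_count = group + 1;

    /* reader (allocator paths): read count before dependent data */
    ngroups = sbi->s_groups_count;
    smp_rmb();
    scan_groups(sbi, ngroups);          /* group data now visible */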
@@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
937 | ext4_handle_dirty_super(handle, sb); | 955 | ext4_handle_dirty_super(handle, sb); |
938 | 956 | ||
939 | exit_journal: | 957 | exit_journal: |
940 | mutex_unlock(&sbi->s_resize_lock); | ||
941 | if ((err2 = ext4_journal_stop(handle)) && !err) | 958 | if ((err2 = ext4_journal_stop(handle)) && !err) |
942 | err = err2; | 959 | err = err2; |
943 | if (!err) { | 960 | if (!err && primary) { |
944 | update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, | 961 | update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, |
945 | sizeof(struct ext4_super_block)); | 962 | sizeof(struct ext4_super_block)); |
946 | update_backups(sb, primary->b_blocknr, primary->b_data, | 963 | update_backups(sb, primary->b_blocknr, primary->b_data, |
@@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
969 | ext4_grpblk_t add; | 986 | ext4_grpblk_t add; |
970 | struct buffer_head *bh; | 987 | struct buffer_head *bh; |
971 | handle_t *handle; | 988 | handle_t *handle; |
972 | int err; | 989 | int err, err2; |
973 | ext4_group_t group; | 990 | ext4_group_t group; |
974 | 991 | ||
975 | /* We don't need to worry about locking wrt other resizers just | ||
976 | * yet: we're going to revalidate es->s_blocks_count after | ||
977 | * taking the s_resize_lock below. */ | ||
978 | o_blocks_count = ext4_blocks_count(es); | 992 | o_blocks_count = ext4_blocks_count(es); |
979 | 993 | ||
980 | if (test_opt(sb, DEBUG)) | 994 | if (test_opt(sb, DEBUG)) |
981 | printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", | 995 | printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n", |
982 | o_blocks_count, n_blocks_count); | 996 | o_blocks_count, n_blocks_count); |
983 | 997 | ||
984 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) | 998 | if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) |
@@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
995 | 1009 | ||
996 | if (n_blocks_count < o_blocks_count) { | 1010 | if (n_blocks_count < o_blocks_count) { |
997 | ext4_warning(sb, "can't shrink FS - resize aborted"); | 1011 | ext4_warning(sb, "can't shrink FS - resize aborted"); |
998 | return -EBUSY; | 1012 | return -EINVAL; |
999 | } | 1013 | } |
1000 | 1014 | ||
1001 | /* Handle the remaining blocks in the last group only. */ | 1015 | /* Handle the remaining blocks in the last group only. */ |
@@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
1038 | goto exit_put; | 1052 | goto exit_put; |
1039 | } | 1053 | } |
1040 | 1054 | ||
1041 | mutex_lock(&EXT4_SB(sb)->s_resize_lock); | ||
1042 | if (o_blocks_count != ext4_blocks_count(es)) { | ||
1043 | ext4_warning(sb, "multiple resizers run on filesystem!"); | ||
1044 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1045 | ext4_journal_stop(handle); | ||
1046 | err = -EBUSY; | ||
1047 | goto exit_put; | ||
1048 | } | ||
1049 | |||
1050 | if ((err = ext4_journal_get_write_access(handle, | 1055 | if ((err = ext4_journal_get_write_access(handle, |
1051 | EXT4_SB(sb)->s_sbh))) { | 1056 | EXT4_SB(sb)->s_sbh))) { |
1052 | ext4_warning(sb, "error %d on journal write access", err); | 1057 | ext4_warning(sb, "error %d on journal write access", err); |
1053 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1054 | ext4_journal_stop(handle); | 1058 | ext4_journal_stop(handle); |
1055 | goto exit_put; | 1059 | goto exit_put; |
1056 | } | 1060 | } |
1057 | ext4_blocks_count_set(es, o_blocks_count + add); | 1061 | ext4_blocks_count_set(es, o_blocks_count + add); |
1058 | mutex_unlock(&EXT4_SB(sb)->s_resize_lock); | ||
1059 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, | 1062 | ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, |
1060 | o_blocks_count + add); | 1063 | o_blocks_count + add); |
1061 | /* We add the blocks to the bitmap and set the group need init bit */ | 1064 | /* We add the blocks to the bitmap and set the group need init bit */ |
1062 | ext4_add_groupblocks(handle, sb, o_blocks_count, add); | 1065 | err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); |
1063 | ext4_handle_dirty_super(handle, sb); | 1066 | ext4_handle_dirty_super(handle, sb); |
1064 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, | 1067 | ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, |
1065 | o_blocks_count + add); | 1068 | o_blocks_count + add); |
1066 | if ((err = ext4_journal_stop(handle))) | 1069 | err2 = ext4_journal_stop(handle); |
1070 | if (!err && err2) | ||
1071 | err = err2; | ||
1072 | |||
1073 | if (err) | ||
1067 | goto exit_put; | 1074 | goto exit_put; |
1068 | 1075 | ||
1069 | if (test_opt(sb, DEBUG)) | 1076 | if (test_opt(sb, DEBUG)) |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9ea71aa864b3..4687fea0c00f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = { | |||
110 | #define IS_EXT3_SB(sb) (0) | 110 | #define IS_EXT3_SB(sb) (0) |
111 | #endif | 111 | #endif |
112 | 112 | ||
113 | void *ext4_kvmalloc(size_t size, gfp_t flags) | ||
114 | { | ||
115 | void *ret; | ||
116 | |||
117 | ret = kmalloc(size, flags); | ||
118 | if (!ret) | ||
119 | ret = __vmalloc(size, flags, PAGE_KERNEL); | ||
120 | return ret; | ||
121 | } | ||
122 | |||
123 | void *ext4_kvzalloc(size_t size, gfp_t flags) | ||
124 | { | ||
125 | void *ret; | ||
126 | |||
127 | ret = kzalloc(size, flags); | ||
128 | if (!ret) | ||
129 | ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); | ||
130 | return ret; | ||
131 | } | ||
132 | |||
133 | void ext4_kvfree(void *ptr) | ||
134 | { | ||
135 | if (is_vmalloc_addr(ptr)) | ||
136 | vfree(ptr); | ||
137 | else | ||
138 | kfree(ptr); | ||
139 | |||
140 | } | ||
141 | |||
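ext4_kvmalloc() and ext4_kvzalloc() fall back to vmalloc() when a large physically contiguous allocation fails, and ext4_kvfree() dispatches on is_vmalloc_addr(), so callers never track which allocator succeeded. Typical use, modeled on the add_new_gdb() conversion above (count is an illustrative variable):

    struct buffer_head **tbl;

    tbl = ext4_kvmalloc(count * sizeof(*tbl), GFP_NOFS);
    if (!tbl)
            return -ENOMEM;
    /* ... fill and publish the table ... */
    ext4_kvfree(tbl);       /* picks vfree() or kfree() by address */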
113 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, | 142 | ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, |
114 | struct ext4_group_desc *bg) | 143 | struct ext4_group_desc *bg) |
115 | { | 144 | { |
@@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) | |||
269 | journal_t *journal; | 298 | journal_t *journal; |
270 | handle_t *handle; | 299 | handle_t *handle; |
271 | 300 | ||
301 | trace_ext4_journal_start(sb, nblocks, _RET_IP_); | ||
272 | if (sb->s_flags & MS_RDONLY) | 302 | if (sb->s_flags & MS_RDONLY) |
273 | return ERR_PTR(-EROFS); | 303 | return ERR_PTR(-EROFS); |
274 | 304 | ||
@@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb) | |||
789 | 819 | ||
790 | for (i = 0; i < sbi->s_gdb_count; i++) | 820 | for (i = 0; i < sbi->s_gdb_count; i++) |
791 | brelse(sbi->s_group_desc[i]); | 821 | brelse(sbi->s_group_desc[i]); |
792 | kfree(sbi->s_group_desc); | 822 | ext4_kvfree(sbi->s_group_desc); |
793 | if (is_vmalloc_addr(sbi->s_flex_groups)) | 823 | ext4_kvfree(sbi->s_flex_groups); |
794 | vfree(sbi->s_flex_groups); | ||
795 | else | ||
796 | kfree(sbi->s_flex_groups); | ||
797 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 824 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
798 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 825 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
799 | percpu_counter_destroy(&sbi->s_dirs_counter); | 826 | percpu_counter_destroy(&sbi->s_dirs_counter); |
@@ -1976,15 +2003,11 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1976 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << | 2003 | ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << |
1977 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; | 2004 | EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; |
1978 | size = flex_group_count * sizeof(struct flex_groups); | 2005 | size = flex_group_count * sizeof(struct flex_groups); |
1979 | sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); | 2006 | sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL); |
1980 | if (sbi->s_flex_groups == NULL) { | 2007 | if (sbi->s_flex_groups == NULL) { |
1981 | sbi->s_flex_groups = vzalloc(size); | 2008 | ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups", |
1982 | if (sbi->s_flex_groups == NULL) { | 2009 | flex_group_count); |
1983 | ext4_msg(sb, KERN_ERR, | 2010 | goto failed; |
1984 | "not enough memory for %u flex groups", | ||
1985 | flex_group_count); | ||
1986 | goto failed; | ||
1987 | } | ||
1988 | } | 2011 | } |
1989 | 2012 | ||
1990 | for (i = 0; i < sbi->s_groups_count; i++) { | 2013 | for (i = 0; i < sbi->s_groups_count; i++) { |
@@ -2383,17 +2406,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | |||
2383 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); | 2406 | unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); |
2384 | unsigned long stripe_width = | 2407 | unsigned long stripe_width = |
2385 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); | 2408 | le32_to_cpu(sbi->s_es->s_raid_stripe_width); |
2409 | int ret; | ||
2386 | 2410 | ||
2387 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) | 2411 | if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) |
2388 | return sbi->s_stripe; | 2412 | ret = sbi->s_stripe; |
2389 | 2413 | else if (stripe_width <= sbi->s_blocks_per_group) | |
2390 | if (stripe_width <= sbi->s_blocks_per_group) | 2414 | ret = stripe_width; |
2391 | return stripe_width; | 2415 | else if (stride <= sbi->s_blocks_per_group) |
2416 | ret = stride; | ||
2417 | else | ||
2418 | ret = 0; | ||
2392 | 2419 | ||
2393 | if (stride <= sbi->s_blocks_per_group) | 2420 | /* |
2394 | return stride; | 2421 | * If the stripe width is 1, this makes no sense and |
2422 | * we set it to 0 to turn off stripe handling code. | ||
2423 | */ | ||
2424 | if (ret <= 1) | ||
2425 | ret = 0; | ||
2395 | 2426 | ||
2396 | return 0; | 2427 | return ret; |
2397 | } | 2428 | } |
2398 | 2429 | ||
2399 | /* sysfs support */ | 2430 | /* sysfs support */ |
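The ext4_get_stripe_size() rewrite routes every candidate through one clamp so a degenerate stripe of one block can no longer reach mballoc. A userspace restatement with a worked example (the parameter names stand in for the superblock values and the RAID geometry is made up):

    #include <stdio.h>

    static unsigned long stripe_size(unsigned long s_stripe,
                                     unsigned long stripe_width,
                                     unsigned long stride,
                                     unsigned long blocks_per_group)
    {
            unsigned long ret;

            if (s_stripe && s_stripe <= blocks_per_group)
                    ret = s_stripe;
            else if (stripe_width <= blocks_per_group)
                    ret = stripe_width;
            else if (stride <= blocks_per_group)
                    ret = stride;
            else
                    ret = 0;

            return ret <= 1 ? 0 : ret;  /* width 1 means no striping */
    }

    int main(void)
    {
            /* RAID metadata advertising stripe_width = 1: the old code
             * returned 1, the rewritten code disables stripe handling */
            printf("%lu\n", stripe_size(0, 1, 16, 32768)); /* prints 0 */
            return 0;
    }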
@@ -3408,8 +3439,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3408 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | 3439 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); |
3409 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | 3440 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
3410 | EXT4_DESC_PER_BLOCK(sb); | 3441 | EXT4_DESC_PER_BLOCK(sb); |
3411 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), | 3442 | sbi->s_group_desc = ext4_kvmalloc(db_count * |
3412 | GFP_KERNEL); | 3443 | sizeof(struct buffer_head *), |
3444 | GFP_KERNEL); | ||
3413 | if (sbi->s_group_desc == NULL) { | 3445 | if (sbi->s_group_desc == NULL) { |
3414 | ext4_msg(sb, KERN_ERR, "not enough memory"); | 3446 | ext4_msg(sb, KERN_ERR, "not enough memory"); |
3415 | goto failed_mount; | 3447 | goto failed_mount; |
@@ -3491,7 +3523,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3491 | 3523 | ||
3492 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ | 3524 | INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ |
3493 | mutex_init(&sbi->s_orphan_lock); | 3525 | mutex_init(&sbi->s_orphan_lock); |
3494 | mutex_init(&sbi->s_resize_lock); | 3526 | sbi->s_resize_flags = 0; |
3495 | 3527 | ||
3496 | sb->s_root = NULL; | 3528 | sb->s_root = NULL; |
3497 | 3529 | ||
@@ -3741,12 +3773,8 @@ failed_mount_wq: | |||
3741 | } | 3773 | } |
3742 | failed_mount3: | 3774 | failed_mount3: |
3743 | del_timer(&sbi->s_err_report); | 3775 | del_timer(&sbi->s_err_report); |
3744 | if (sbi->s_flex_groups) { | 3776 | if (sbi->s_flex_groups) |
3745 | if (is_vmalloc_addr(sbi->s_flex_groups)) | 3777 | ext4_kvfree(sbi->s_flex_groups); |
3746 | vfree(sbi->s_flex_groups); | ||
3747 | else | ||
3748 | kfree(sbi->s_flex_groups); | ||
3749 | } | ||
3750 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 3778 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
3751 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 3779 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
3752 | percpu_counter_destroy(&sbi->s_dirs_counter); | 3780 | percpu_counter_destroy(&sbi->s_dirs_counter); |
@@ -3756,7 +3784,7 @@ failed_mount3: | |||
3756 | failed_mount2: | 3784 | failed_mount2: |
3757 | for (i = 0; i < db_count; i++) | 3785 | for (i = 0; i < db_count; i++) |
3758 | brelse(sbi->s_group_desc[i]); | 3786 | brelse(sbi->s_group_desc[i]); |
3759 | kfree(sbi->s_group_desc); | 3787 | ext4_kvfree(sbi->s_group_desc); |
3760 | failed_mount: | 3788 | failed_mount: |
3761 | if (sbi->s_proc) { | 3789 | if (sbi->s_proc) { |
3762 | remove_proc_entry(sb->s_id, ext4_proc_root); | 3790 | remove_proc_entry(sb->s_id, ext4_proc_root); |
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h new file mode 100644 index 000000000000..011ba6670d99 --- /dev/null +++ b/fs/ext4/truncate.h | |||
@@ -0,0 +1,43 @@ | |||
1 | /* | ||
2 | * linux/fs/ext4/truncate.h | ||
3 | * | ||
4 | * Common inline functions needed for truncate support | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * Truncate blocks that were not used by write. We have to truncate the | ||
9 | * pagecache as well so that corresponding buffers get properly unmapped. | ||
10 | */ | ||
11 | static inline void ext4_truncate_failed_write(struct inode *inode) | ||
12 | { | ||
13 | truncate_inode_pages(inode->i_mapping, inode->i_size); | ||
14 | ext4_truncate(inode); | ||
15 | } | ||
16 | |||
17 | /* | ||
18 | * Work out how many blocks we need to proceed with the next chunk of a | ||
19 | * truncate transaction. | ||
20 | */ | ||
21 | static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) | ||
22 | { | ||
23 | ext4_lblk_t needed; | ||
24 | |||
25 | needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); | ||
26 | |||
27 | /* Give ourselves just enough room to cope with inodes in which | ||
28 | * i_blocks is corrupt: we've seen disk corruptions in the past | ||
29 | * which resulted in random data in an inode which looked enough | ||
30 | * like a regular file for ext4 to try to delete it. Things | ||
31 | * will go a bit crazy if that happens, but at least we should | ||
32 | * try not to panic the whole kernel. */ | ||
33 | if (needed < 2) | ||
34 | needed = 2; | ||
35 | |||
36 | /* But we need to bound the transaction so we don't overflow the | ||
37 | * journal. */ | ||
38 | if (needed > EXT4_MAX_TRANS_DATA) | ||
39 | needed = EXT4_MAX_TRANS_DATA; | ||
40 | |||
41 | return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; | ||
42 | } | ||
43 | |||
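ext4_blocks_for_truncate() converts i_blocks, counted in 512-byte sectors, into filesystem blocks with the shift, then clamps the result into [2, EXT4_MAX_TRANS_DATA]. Worked through with assumed values (4 KiB blocks so s_blocksize_bits = 12, i_blocks = 800 sectors, and EXT4_MAX_TRANS_DATA taken as 64 per ext4_jbd2.h):

    needed  = 800 >> (12 - 9)                   = 100 blocks
    needed  = clamp(100, 2, 64)                 = 64
    credits = EXT4_DATA_TRANS_BLOCKS(sb) + 64   /* per truncate chunk */

So a corrupt-looking i_blocks never asks for more than a bounded transaction, and a tiny inode still reserves enough credits to make progress.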
diff --git a/fs/generic_acl.c b/fs/generic_acl.c index d5e33a077a67..d0dddaceac59 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c | |||
@@ -82,18 +82,14 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, | |||
82 | return PTR_ERR(acl); | 82 | return PTR_ERR(acl); |
83 | } | 83 | } |
84 | if (acl) { | 84 | if (acl) { |
85 | mode_t mode; | ||
86 | |||
87 | error = posix_acl_valid(acl); | 85 | error = posix_acl_valid(acl); |
88 | if (error) | 86 | if (error) |
89 | goto failed; | 87 | goto failed; |
90 | switch (type) { | 88 | switch (type) { |
91 | case ACL_TYPE_ACCESS: | 89 | case ACL_TYPE_ACCESS: |
92 | mode = inode->i_mode; | 90 | error = posix_acl_equiv_mode(acl, &inode->i_mode); |
93 | error = posix_acl_equiv_mode(acl, &mode); | ||
94 | if (error < 0) | 91 | if (error < 0) |
95 | goto failed; | 92 | goto failed; |
96 | inode->i_mode = mode; | ||
97 | inode->i_ctime = CURRENT_TIME; | 93 | inode->i_ctime = CURRENT_TIME; |
98 | if (error == 0) { | 94 | if (error == 0) { |
99 | posix_acl_release(acl); | 95 | posix_acl_release(acl); |
@@ -125,21 +121,20 @@ int | |||
125 | generic_acl_init(struct inode *inode, struct inode *dir) | 121 | generic_acl_init(struct inode *inode, struct inode *dir) |
126 | { | 122 | { |
127 | struct posix_acl *acl = NULL; | 123 | struct posix_acl *acl = NULL; |
128 | mode_t mode = inode->i_mode; | ||
129 | int error; | 124 | int error; |
130 | 125 | ||
131 | inode->i_mode = mode & ~current_umask(); | ||
132 | if (!S_ISLNK(inode->i_mode)) | 126 | if (!S_ISLNK(inode->i_mode)) |
133 | acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); | 127 | acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); |
134 | if (acl) { | 128 | if (acl) { |
135 | if (S_ISDIR(inode->i_mode)) | 129 | if (S_ISDIR(inode->i_mode)) |
136 | set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); | 130 | set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); |
137 | error = posix_acl_create(&acl, GFP_KERNEL, &mode); | 131 | error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); |
138 | if (error < 0) | 132 | if (error < 0) |
139 | return error; | 133 | return error; |
140 | inode->i_mode = mode; | ||
141 | if (error > 0) | 134 | if (error > 0) |
142 | set_cached_acl(inode, ACL_TYPE_ACCESS, acl); | 135 | set_cached_acl(inode, ACL_TYPE_ACCESS, acl); |
136 | } else { | ||
137 | inode->i_mode &= ~current_umask(); | ||
143 | } | 138 | } |
144 | error = 0; | 139 | error = 0; |
145 | 140 | ||
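generic_acl_set() above now lets posix_acl_equiv_mode() write straight into inode->i_mode, which is what the umode_t conversion enables throughout this series. Its contract: return 0 when the ACL is exactly representable in the mode bits it writes back, 1 when extended entries remain, and -E... on error (see the posix_acl.c comment at the end of this diff). The common caller shape, sketched:

    umode_t mode = inode->i_mode;
    int rc = posix_acl_equiv_mode(acl, &mode);

    if (rc < 0)
            return rc;              /* invalid ACL */
    inode->i_mode = mode;           /* mode bits updated either way */
    if (rc == 0) {
            posix_acl_release(acl); /* pure mode bits: no xattr needed */
            acl = NULL;
    }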
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 884c9af0542f..34501b64bc47 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c | |||
@@ -72,7 +72,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type) | |||
72 | return gfs2_acl_get(GFS2_I(inode), type); | 72 | return gfs2_acl_get(GFS2_I(inode), type); |
73 | } | 73 | } |
74 | 74 | ||
75 | static int gfs2_set_mode(struct inode *inode, mode_t mode) | 75 | static int gfs2_set_mode(struct inode *inode, umode_t mode) |
76 | { | 76 | { |
77 | int error = 0; | 77 | int error = 0; |
78 | 78 | ||
@@ -117,7 +117,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode) | |||
117 | { | 117 | { |
118 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 118 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
119 | struct posix_acl *acl; | 119 | struct posix_acl *acl; |
120 | mode_t mode = inode->i_mode; | 120 | umode_t mode = inode->i_mode; |
121 | int error = 0; | 121 | int error = 0; |
122 | 122 | ||
123 | if (!sdp->sd_args.ar_posix_acl) | 123 | if (!sdp->sd_args.ar_posix_acl) |
@@ -276,7 +276,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name, | |||
276 | goto out_release; | 276 | goto out_release; |
277 | 277 | ||
278 | if (type == ACL_TYPE_ACCESS) { | 278 | if (type == ACL_TYPE_ACCESS) { |
279 | mode_t mode = inode->i_mode; | 279 | umode_t mode = inode->i_mode; |
280 | error = posix_acl_equiv_mode(acl, &mode); | 280 | error = posix_acl_equiv_mode(acl, &mode); |
281 | 281 | ||
282 | if (error <= 0) { | 282 | if (error <= 0) { |
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index 8635be5ffd97..970ea987b3f6 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/statfs.h> | 16 | #include <linux/statfs.h> |
17 | #include <linux/types.h> | 17 | #include <linux/types.h> |
18 | #include <linux/pid_namespace.h> | 18 | #include <linux/pid_namespace.h> |
19 | #include <linux/namei.h> | ||
19 | #include <asm/uaccess.h> | 20 | #include <asm/uaccess.h> |
20 | #include "os.h" | 21 | #include "os.h" |
21 | 22 | ||
diff --git a/fs/inode.c b/fs/inode.c index d0c72ff6b30e..73920d555c88 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -143,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) | |||
143 | inode->i_op = &empty_iops; | 143 | inode->i_op = &empty_iops; |
144 | inode->i_fop = &empty_fops; | 144 | inode->i_fop = &empty_fops; |
145 | inode->i_nlink = 1; | 145 | inode->i_nlink = 1; |
146 | inode->i_opflags = 0; | ||
146 | inode->i_uid = 0; | 147 | inode->i_uid = 0; |
147 | inode->i_gid = 0; | 148 | inode->i_gid = 0; |
148 | atomic_set(&inode->i_writecount, 0); | 149 | atomic_set(&inode->i_writecount, 0); |
@@ -399,12 +400,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval) | |||
399 | EXPORT_SYMBOL(__insert_inode_hash); | 400 | EXPORT_SYMBOL(__insert_inode_hash); |
400 | 401 | ||
401 | /** | 402 | /** |
402 | * remove_inode_hash - remove an inode from the hash | 403 | * __remove_inode_hash - remove an inode from the hash |
403 | * @inode: inode to unhash | 404 | * @inode: inode to unhash |
404 | * | 405 | * |
405 | * Remove an inode from the superblock. | 406 | * Remove an inode from the superblock. |
406 | */ | 407 | */ |
407 | void remove_inode_hash(struct inode *inode) | 408 | void __remove_inode_hash(struct inode *inode) |
408 | { | 409 | { |
409 | spin_lock(&inode_hash_lock); | 410 | spin_lock(&inode_hash_lock); |
410 | spin_lock(&inode->i_lock); | 411 | spin_lock(&inode->i_lock); |
@@ -412,7 +413,7 @@ void remove_inode_hash(struct inode *inode) | |||
412 | spin_unlock(&inode->i_lock); | 413 | spin_unlock(&inode->i_lock); |
413 | spin_unlock(&inode_hash_lock); | 414 | spin_unlock(&inode_hash_lock); |
414 | } | 415 | } |
415 | EXPORT_SYMBOL(remove_inode_hash); | 416 | EXPORT_SYMBOL(__remove_inode_hash); |
416 | 417 | ||
417 | void end_writeback(struct inode *inode) | 418 | void end_writeback(struct inode *inode) |
418 | { | 419 | { |
@@ -454,7 +455,9 @@ static void evict(struct inode *inode) | |||
454 | BUG_ON(!(inode->i_state & I_FREEING)); | 455 | BUG_ON(!(inode->i_state & I_FREEING)); |
455 | BUG_ON(!list_empty(&inode->i_lru)); | 456 | BUG_ON(!list_empty(&inode->i_lru)); |
456 | 457 | ||
457 | inode_wb_list_del(inode); | 458 | if (!list_empty(&inode->i_wb_list)) |
459 | inode_wb_list_del(inode); | ||
460 | |||
458 | inode_sb_list_del(inode); | 461 | inode_sb_list_del(inode); |
459 | 462 | ||
460 | if (op->evict_inode) { | 463 | if (op->evict_inode) { |
@@ -1328,7 +1331,8 @@ static void iput_final(struct inode *inode) | |||
1328 | } | 1331 | } |
1329 | 1332 | ||
1330 | inode->i_state |= I_FREEING; | 1333 | inode->i_state |= I_FREEING; |
1331 | inode_lru_list_del(inode); | 1334 | if (!list_empty(&inode->i_lru)) |
1335 | inode_lru_list_del(inode); | ||
1332 | spin_unlock(&inode->i_lock); | 1336 | spin_unlock(&inode->i_lock); |
1333 | 1337 | ||
1334 | evict(inode); | 1338 | evict(inode); |
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 2c62c5aae82f..16a698bd906d 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -257,9 +257,12 @@ static void | |||
257 | __flush_batch(journal_t *journal, int *batch_count) | 257 | __flush_batch(journal_t *journal, int *batch_count) |
258 | { | 258 | { |
259 | int i; | 259 | int i; |
260 | struct blk_plug plug; | ||
260 | 261 | ||
262 | blk_start_plug(&plug); | ||
261 | for (i = 0; i < *batch_count; i++) | 263 | for (i = 0; i < *batch_count; i++) |
262 | write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); | 264 | write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); |
265 | blk_finish_plug(&plug); | ||
263 | 266 | ||
264 | for (i = 0; i < *batch_count; i++) { | 267 | for (i = 0; i < *batch_count; i++) { |
265 | struct buffer_head *bh = journal->j_chkpt_bhs[i]; | 268 | struct buffer_head *bh = journal->j_chkpt_bhs[i]; |
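The checkpoint change wraps the batch in a block plug and upgrades the writes to WRITE_SYNC, so the requests are merged locally and then submitted as foreground I/O rather than being treated like background writeback. The idiom in isolation (nr and bhs stand in for the j_chkpt_bhs batch):

    struct blk_plug plug;
    int i;

    blk_start_plug(&plug);          /* queue requests locally */
    for (i = 0; i < nr; i++)
            write_dirty_buffer(bhs[i], WRITE_SYNC);
    blk_finish_plug(&plug);         /* submit the merged requests */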
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0dfa5b598e68..f24df13adc4e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -2390,73 +2390,6 @@ static void __exit journal_exit(void) | |||
2390 | jbd2_journal_destroy_caches(); | 2390 | jbd2_journal_destroy_caches(); |
2391 | } | 2391 | } |
2392 | 2392 | ||
2393 | /* | ||
2394 | * jbd2_dev_to_name is a utility function used by the jbd2 and ext4 | ||
2395 | * tracing infrastructure to map a dev_t to a device name. | ||
2396 | * | ||
2397 | * The caller should use rcu_read_lock() in order to make sure the | ||
2398 | * device name stays valid until its done with it. We use | ||
2399 | * rcu_read_lock() as well to make sure we're safe in case the caller | ||
2400 | * gets sloppy, and because rcu_read_lock() is cheap and can be safely | ||
2401 | * nested. | ||
2402 | */ | ||
2403 | struct devname_cache { | ||
2404 | struct rcu_head rcu; | ||
2405 | dev_t device; | ||
2406 | char devname[BDEVNAME_SIZE]; | ||
2407 | }; | ||
2408 | #define CACHE_SIZE_BITS 6 | ||
2409 | static struct devname_cache *devcache[1 << CACHE_SIZE_BITS]; | ||
2410 | static DEFINE_SPINLOCK(devname_cache_lock); | ||
2411 | |||
2412 | static void free_devcache(struct rcu_head *rcu) | ||
2413 | { | ||
2414 | kfree(rcu); | ||
2415 | } | ||
2416 | |||
2417 | const char *jbd2_dev_to_name(dev_t device) | ||
2418 | { | ||
2419 | int i = hash_32(device, CACHE_SIZE_BITS); | ||
2420 | char *ret; | ||
2421 | struct block_device *bd; | ||
2422 | static struct devname_cache *new_dev; | ||
2423 | |||
2424 | rcu_read_lock(); | ||
2425 | if (devcache[i] && devcache[i]->device == device) { | ||
2426 | ret = devcache[i]->devname; | ||
2427 | rcu_read_unlock(); | ||
2428 | return ret; | ||
2429 | } | ||
2430 | rcu_read_unlock(); | ||
2431 | |||
2432 | new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); | ||
2433 | if (!new_dev) | ||
2434 | return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ | ||
2435 | bd = bdget(device); | ||
2436 | spin_lock(&devname_cache_lock); | ||
2437 | if (devcache[i]) { | ||
2438 | if (devcache[i]->device == device) { | ||
2439 | kfree(new_dev); | ||
2440 | bdput(bd); | ||
2441 | ret = devcache[i]->devname; | ||
2442 | spin_unlock(&devname_cache_lock); | ||
2443 | return ret; | ||
2444 | } | ||
2445 | call_rcu(&devcache[i]->rcu, free_devcache); | ||
2446 | } | ||
2447 | devcache[i] = new_dev; | ||
2448 | devcache[i]->device = device; | ||
2449 | if (bd) { | ||
2450 | bdevname(bd, devcache[i]->devname); | ||
2451 | bdput(bd); | ||
2452 | } else | ||
2453 | __bdevname(device, devcache[i]->devname); | ||
2454 | ret = devcache[i]->devname; | ||
2455 | spin_unlock(&devname_cache_lock); | ||
2456 | return ret; | ||
2457 | } | ||
2458 | EXPORT_SYMBOL(jbd2_dev_to_name); | ||
2459 | |||
2460 | MODULE_LICENSE("GPL"); | 2393 | MODULE_LICENSE("GPL"); |
2461 | module_init(journal_init); | 2394 | module_init(journal_init); |
2462 | module_exit(journal_exit); | 2395 | module_exit(journal_exit); |
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 27c511a1cf05..926d02068a14 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c | |||
@@ -227,7 +227,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
227 | case ACL_TYPE_ACCESS: | 227 | case ACL_TYPE_ACCESS: |
228 | xprefix = JFFS2_XPREFIX_ACL_ACCESS; | 228 | xprefix = JFFS2_XPREFIX_ACL_ACCESS; |
229 | if (acl) { | 229 | if (acl) { |
230 | mode_t mode = inode->i_mode; | 230 | umode_t mode = inode->i_mode; |
231 | rc = posix_acl_equiv_mode(acl, &mode); | 231 | rc = posix_acl_equiv_mode(acl, &mode); |
232 | if (rc < 0) | 232 | if (rc < 0) |
233 | return rc; | 233 | return rc; |
@@ -259,7 +259,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
259 | return rc; | 259 | return rc; |
260 | } | 260 | } |
261 | 261 | ||
262 | int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) | 262 | int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode) |
263 | { | 263 | { |
264 | struct posix_acl *acl; | 264 | struct posix_acl *acl; |
265 | int rc; | 265 | int rc; |
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index b3421c78d9f8..9b477246f2a6 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h | |||
@@ -28,7 +28,7 @@ struct jffs2_acl_header { | |||
28 | 28 | ||
29 | struct posix_acl *jffs2_get_acl(struct inode *inode, int type); | 29 | struct posix_acl *jffs2_get_acl(struct inode *inode, int type); |
30 | extern int jffs2_acl_chmod(struct inode *); | 30 | extern int jffs2_acl_chmod(struct inode *); |
31 | extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); | 31 | extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); |
32 | extern int jffs2_init_acl_post(struct inode *); | 32 | extern int jffs2_init_acl_post(struct inode *); |
33 | 33 | ||
34 | extern const struct xattr_handler jffs2_acl_access_xattr_handler; | 34 | extern const struct xattr_handler jffs2_acl_access_xattr_handler; |
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index b81b35ddf4e4..bbcb9755dd2b 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c | |||
@@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) | |||
406 | 406 | ||
407 | /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, | 407 | /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, |
408 | fill in the raw_inode while you're at it. */ | 408 | fill in the raw_inode while you're at it. */ |
409 | struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri) | 409 | struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri) |
410 | { | 410 | { |
411 | struct inode *inode; | 411 | struct inode *inode; |
412 | struct super_block *sb = dir_i->i_sb; | 412 | struct super_block *sb = dir_i->i_sb; |
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 526979c607b6..6c1755c59c0f 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h | |||
@@ -173,7 +173,7 @@ int jffs2_do_setattr (struct inode *, struct iattr *); | |||
173 | struct inode *jffs2_iget(struct super_block *, unsigned long); | 173 | struct inode *jffs2_iget(struct super_block *, unsigned long); |
174 | void jffs2_evict_inode (struct inode *); | 174 | void jffs2_evict_inode (struct inode *); |
175 | void jffs2_dirty_inode(struct inode *inode, int flags); | 175 | void jffs2_dirty_inode(struct inode *inode, int flags); |
176 | struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, | 176 | struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, |
177 | struct jffs2_raw_inode *ri); | 177 | struct jffs2_raw_inode *ri); |
178 | int jffs2_statfs (struct dentry *, struct kstatfs *); | 178 | int jffs2_statfs (struct dentry *, struct kstatfs *); |
179 | int jffs2_remount_fs (struct super_block *, int *, char *); | 179 | int jffs2_remount_fs (struct super_block *, int *, char *); |
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index b3a32caf2b45..45559dc3ea2f 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c | |||
@@ -127,16 +127,14 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) | |||
127 | return PTR_ERR(acl); | 127 | return PTR_ERR(acl); |
128 | 128 | ||
129 | if (acl) { | 129 | if (acl) { |
130 | mode_t mode = inode->i_mode; | ||
131 | if (S_ISDIR(inode->i_mode)) { | 130 | if (S_ISDIR(inode->i_mode)) { |
132 | rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); | 131 | rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); |
133 | if (rc) | 132 | if (rc) |
134 | goto cleanup; | 133 | goto cleanup; |
135 | } | 134 | } |
136 | rc = posix_acl_create(&acl, GFP_KERNEL, &mode); | 135 | rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode); |
137 | if (rc < 0) | 136 | if (rc < 0) |
138 | goto cleanup; /* posix_acl_release(NULL) is no-op */ | 137 | goto cleanup; /* posix_acl_release(NULL) is no-op */ |
139 | inode->i_mode = mode; | ||
140 | if (rc > 0) | 138 | if (rc > 0) |
141 | rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); | 139 | rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); |
142 | cleanup: | 140 | cleanup: |
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24838f1eeee5..e87fedef23db 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c | |||
@@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name, | |||
693 | return rc; | 693 | return rc; |
694 | } | 694 | } |
695 | if (acl) { | 695 | if (acl) { |
696 | mode_t mode = inode->i_mode; | 696 | rc = posix_acl_equiv_mode(acl, &inode->i_mode); |
697 | rc = posix_acl_equiv_mode(acl, &mode); | ||
698 | posix_acl_release(acl); | 697 | posix_acl_release(acl); |
699 | if (rc < 0) { | 698 | if (rc < 0) { |
700 | printk(KERN_ERR | 699 | printk(KERN_ERR |
@@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name, | |||
702 | rc); | 701 | rc); |
703 | return rc; | 702 | return rc; |
704 | } | 703 | } |
705 | inode->i_mode = mode; | ||
706 | mark_inode_dirty(inode); | 704 | mark_inode_dirty(inode); |
707 | } | 705 | } |
708 | /* | 706 | /* |
diff --git a/fs/namei.c b/fs/namei.c index f8c69d373793..2826db35dc25 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -179,19 +179,14 @@ static int check_acl(struct inode *inode, int mask) | |||
179 | #ifdef CONFIG_FS_POSIX_ACL | 179 | #ifdef CONFIG_FS_POSIX_ACL |
180 | struct posix_acl *acl; | 180 | struct posix_acl *acl; |
181 | 181 | ||
182 | /* | ||
183 | * Under RCU walk, we cannot even do a "get_cached_acl()", | ||
184 | * because that involves locking and getting a refcount on | ||
185 | * a cached ACL. | ||
186 | * | ||
187 | * So the only case we handle during RCU walking is the | ||
188 | * case of a cached "no ACL at all", which needs no locks | ||
189 | * or refcounts. | ||
190 | */ | ||
191 | if (mask & MAY_NOT_BLOCK) { | 182 | if (mask & MAY_NOT_BLOCK) { |
192 | if (negative_cached_acl(inode, ACL_TYPE_ACCESS)) | 183 | acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); |
184 | if (!acl) | ||
193 | return -EAGAIN; | 185 | return -EAGAIN; |
194 | return -ECHILD; | 186 | /* no ->get_acl() calls in RCU mode... */ |
187 | if (acl == ACL_NOT_CACHED) | ||
188 | return -ECHILD; | ||
189 | return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK); | ||
195 | } | 190 | } |
196 | 191 | ||
197 | acl = get_cached_acl(inode, ACL_TYPE_ACCESS); | 192 | acl = get_cached_acl(inode, ACL_TYPE_ACCESS); |
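The return codes in the RCU branch above are a contract with the permission-check caller: -EAGAIN means a cached "no ACL", so plain mode bits decide; -ECHILD means the ACL exists but is not cached, so the walk must drop out of RCU mode and retry; anything else is posix_acl_permission()'s verdict computed under rcu_read_lock(). The caller side looks roughly like this (paraphrased, not verbatim fs/namei.c):

    if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
            int error = check_acl(inode, mask);
            if (error != -EAGAIN)
                    return error;   /* ACL decided, or -ECHILD bubbles up */
    }
    /* -EAGAIN: fall through to group/other mode-bit checks */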
@@ -313,6 +308,26 @@ int generic_permission(struct inode *inode, int mask) | |||
313 | return -EACCES; | 308 | return -EACCES; |
314 | } | 309 | } |
315 | 310 | ||
311 | /* | ||
312 | * We _really_ want to just do "generic_permission()" without | ||
313 | * even looking at the inode->i_op values. So we keep a cache | ||
314 | * flag in inode->i_opflags, that says "this has not special | ||
315 | * permission function, use the fast case". | ||
316 | */ | ||
317 | static inline int do_inode_permission(struct inode *inode, int mask) | ||
318 | { | ||
319 | if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { | ||
320 | if (likely(inode->i_op->permission)) | ||
321 | return inode->i_op->permission(inode, mask); | ||
322 | |||
323 | /* This gets set once for the inode lifetime */ | ||
324 | spin_lock(&inode->i_lock); | ||
325 | inode->i_opflags |= IOP_FASTPERM; | ||
326 | spin_unlock(&inode->i_lock); | ||
327 | } | ||
328 | return generic_permission(inode, mask); | ||
329 | } | ||
330 | |||
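do_inode_permission() is the first of three users of this trick; IOP_NOFOLLOW and IOP_LOOKUP follow below. The bits only ever transition from 0 to 1 for the lifetime of the inode (inode_init_always() zeroes i_opflags, and nothing clears a bit), so the unlocked fast-path test is safe; i_lock merely serializes the read-modify-write against the other IOP_* bits. The flags as assumed from include/linux/fs.h in this series:

    #define IOP_FASTPERM    0x0001  /* no ->permission(): generic path */
    #define IOP_LOOKUP      0x0002  /* ->lookup() exists: can descend */
    #define IOP_NOFOLLOW    0x0004  /* no ->follow_link() */

The worst case is two tasks racing to set the same bit, which is harmless: both take i_lock and both OR in the same value.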
316 | /** | 331 | /** |
317 | * inode_permission - check for access rights to a given inode | 332 | * inode_permission - check for access rights to a given inode |
318 | * @inode: inode to check permission on | 333 | * @inode: inode to check permission on |
@@ -327,7 +342,7 @@ int inode_permission(struct inode *inode, int mask) | |||
327 | { | 342 | { |
328 | int retval; | 343 | int retval; |
329 | 344 | ||
330 | if (mask & MAY_WRITE) { | 345 | if (unlikely(mask & MAY_WRITE)) { |
331 | umode_t mode = inode->i_mode; | 346 | umode_t mode = inode->i_mode; |
332 | 347 | ||
333 | /* | 348 | /* |
@@ -344,11 +359,7 @@ int inode_permission(struct inode *inode, int mask) | |||
344 | return -EACCES; | 359 | return -EACCES; |
345 | } | 360 | } |
346 | 361 | ||
347 | if (inode->i_op->permission) | 362 | retval = do_inode_permission(inode, mask); |
348 | retval = inode->i_op->permission(inode, mask); | ||
349 | else | ||
350 | retval = generic_permission(inode, mask); | ||
351 | |||
352 | if (retval) | 363 | if (retval) |
353 | return retval; | 364 | return retval; |
354 | 365 | ||
@@ -716,19 +727,25 @@ static int follow_automount(struct path *path, unsigned flags, | |||
716 | if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) | 727 | if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) |
717 | return -EISDIR; /* we actually want to stop here */ | 728 | return -EISDIR; /* we actually want to stop here */ |
718 | 729 | ||
719 | /* We want to mount if someone is trying to open/create a file of any | 730 | /* |
720 | * type under the mountpoint, wants to traverse through the mountpoint | ||
721 | * or wants to open the mounted directory. | ||
722 | * | ||
723 | * We don't want to mount if someone's just doing a stat and they've | 731 | * We don't want to mount if someone's just doing a stat and they've |
724 | * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and | 732 | * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and |
725 | * appended a '/' to the name. | 733 | * appended a '/' to the name. |
726 | */ | 734 | */ |
727 | if (!(flags & LOOKUP_FOLLOW) && | 735 | if (!(flags & LOOKUP_FOLLOW)) { |
728 | !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | | 736 | /* We do, however, want to mount if someone wants to open or |
729 | LOOKUP_OPEN | LOOKUP_CREATE))) | 737 | * create a file of any type under the mountpoint, wants to |
730 | return -EISDIR; | 738 | * traverse through the mountpoint or wants to open the mounted |
731 | 739 | * directory. | |
740 | * Also, autofs may mark negative dentries as being automount | ||
741 | * points. These will need the attentions of the daemon to | ||
742 | * instantiate them before they can be used. | ||
743 | */ | ||
744 | if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | | ||
745 | LOOKUP_OPEN | LOOKUP_CREATE)) && | ||
746 | path->dentry->d_inode) | ||
747 | return -EISDIR; | ||
748 | } | ||
732 | current->total_link_count++; | 749 | current->total_link_count++; |
733 | if (current->total_link_count >= 40) | 750 | if (current->total_link_count >= 40) |
734 | return -ELOOP; | 751 | return -ELOOP; |
@@ -1244,6 +1261,26 @@ static void terminate_walk(struct nameidata *nd) | |||
1244 | } | 1261 | } |
1245 | } | 1262 | } |
1246 | 1263 | ||
1264 | /* | ||
1265 | * Do we need to follow links? We _really_ want to be able | ||
1266 | * to do this check without having to look at inode->i_op, | ||
1267 | * so we keep a cache of "no, this doesn't need follow_link" | ||
1268 | * for the common case. | ||
1269 | */ | ||
1270 | static inline int should_follow_link(struct inode *inode, int follow) | ||
1271 | { | ||
1272 | if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { | ||
1273 | if (likely(inode->i_op->follow_link)) | ||
1274 | return follow; | ||
1275 | |||
1276 | /* This gets set once for the inode lifetime */ | ||
1277 | spin_lock(&inode->i_lock); | ||
1278 | inode->i_opflags |= IOP_NOFOLLOW; | ||
1279 | spin_unlock(&inode->i_lock); | ||
1280 | } | ||
1281 | return 0; | ||
1282 | } | ||
1283 | |||
1247 | static inline int walk_component(struct nameidata *nd, struct path *path, | 1284 | static inline int walk_component(struct nameidata *nd, struct path *path, |
1248 | struct qstr *name, int type, int follow) | 1285 | struct qstr *name, int type, int follow) |
1249 | { | 1286 | { |
@@ -1266,7 +1303,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, | |||
1266 | terminate_walk(nd); | 1303 | terminate_walk(nd); |
1267 | return -ENOENT; | 1304 | return -ENOENT; |
1268 | } | 1305 | } |
1269 | if (unlikely(inode->i_op->follow_link) && follow) { | 1306 | if (should_follow_link(inode, follow)) { |
1270 | if (nd->flags & LOOKUP_RCU) { | 1307 | if (nd->flags & LOOKUP_RCU) { |
1271 | if (unlikely(unlazy_walk(nd, path->dentry))) { | 1308 | if (unlikely(unlazy_walk(nd, path->dentry))) { |
1272 | terminate_walk(nd); | 1309 | terminate_walk(nd); |
@@ -1319,6 +1356,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) | |||
1319 | } | 1356 | } |
1320 | 1357 | ||
1321 | /* | 1358 | /* |
1359 | * We really don't want to look at inode->i_op->lookup | ||
1360 | * when we don't have to. So we keep a cache bit in | ||
1361 | * the inode ->i_opflags field that says "yes, we can | ||
1362 | * do lookup on this inode". | ||
1363 | */ | ||
1364 | static inline int can_lookup(struct inode *inode) | ||
1365 | { | ||
1366 | if (likely(inode->i_opflags & IOP_LOOKUP)) | ||
1367 | return 1; | ||
1368 | if (likely(!inode->i_op->lookup)) | ||
1369 | return 0; | ||
1370 | |||
1371 | /* We do this once for the lifetime of the inode */ | ||
1372 | spin_lock(&inode->i_lock); | ||
1373 | inode->i_opflags |= IOP_LOOKUP; | ||
1374 | spin_unlock(&inode->i_lock); | ||
1375 | return 1; | ||
1376 | } | ||
1377 | |||
1378 | /* | ||
1322 | * Name resolution. | 1379 | * Name resolution. |
1323 | * This is the basic name resolution function, turning a pathname into | 1380 | * This is the basic name resolution function, turning a pathname into |
1324 | * the final dentry. We expect 'base' to be positive and a directory. | 1381 | * the final dentry. We expect 'base' to be positive and a directory. |
@@ -1397,10 +1454,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) | |||
1397 | if (err) | 1454 | if (err) |
1398 | return err; | 1455 | return err; |
1399 | } | 1456 | } |
1457 | if (can_lookup(nd->inode)) | ||
1458 | continue; | ||
1400 | err = -ENOTDIR; | 1459 | err = -ENOTDIR; |
1401 | if (!nd->inode->i_op->lookup) | 1460 | break; |
1402 | break; | ||
1403 | continue; | ||
1404 | /* here ends the main loop */ | 1461 | /* here ends the main loop */ |
1405 | 1462 | ||
1406 | last_component: | 1463 | last_component: |
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index e49e73107e62..7ef23979896d 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c | |||
@@ -415,7 +415,7 @@ fail: | |||
415 | } | 415 | } |
416 | 416 | ||
417 | int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, | 417 | int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, |
418 | mode_t mode) | 418 | umode_t mode) |
419 | { | 419 | { |
420 | struct posix_acl *dfacl, *acl; | 420 | struct posix_acl *dfacl, *acl; |
421 | int error = 0; | 421 | int error = 0; |
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 38053d823eb0..85f1690ca08c 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c | |||
@@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, | |||
316 | int flags, struct nfs_open_context *ctx) | 316 | int flags, struct nfs_open_context *ctx) |
317 | { | 317 | { |
318 | struct nfs3_createdata *data; | 318 | struct nfs3_createdata *data; |
319 | mode_t mode = sattr->ia_mode; | 319 | umode_t mode = sattr->ia_mode; |
320 | int status = -ENOMEM; | 320 | int status = -ENOMEM; |
321 | 321 | ||
322 | dprintk("NFS call create %s\n", dentry->d_name.name); | 322 | dprintk("NFS call create %s\n", dentry->d_name.name); |
@@ -562,7 +562,7 @@ static int | |||
562 | nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) | 562 | nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) |
563 | { | 563 | { |
564 | struct nfs3_createdata *data; | 564 | struct nfs3_createdata *data; |
565 | int mode = sattr->ia_mode; | 565 | umode_t mode = sattr->ia_mode; |
566 | int status = -ENOMEM; | 566 | int status = -ENOMEM; |
567 | 567 | ||
568 | dprintk("NFS call mkdir %s\n", dentry->d_name.name); | 568 | dprintk("NFS call mkdir %s\n", dentry->d_name.name); |
@@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, | |||
681 | dev_t rdev) | 681 | dev_t rdev) |
682 | { | 682 | { |
683 | struct nfs3_createdata *data; | 683 | struct nfs3_createdata *data; |
684 | mode_t mode = sattr->ia_mode; | 684 | umode_t mode = sattr->ia_mode; |
685 | int status = -ENOMEM; | 685 | int status = -ENOMEM; |
686 | 686 | ||
687 | dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, | 687 | dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, |
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 783c58d9daf1..a7219075b4de 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c | |||
@@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle, | |||
247 | case ACL_TYPE_ACCESS: | 247 | case ACL_TYPE_ACCESS: |
248 | name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; | 248 | name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; |
249 | if (acl) { | 249 | if (acl) { |
250 | mode_t mode = inode->i_mode; | 250 | umode_t mode = inode->i_mode; |
251 | ret = posix_acl_equiv_mode(acl, &mode); | 251 | ret = posix_acl_equiv_mode(acl, &mode); |
252 | if (ret < 0) | 252 | if (ret < 0) |
253 | return ret; | 253 | return ret; |
@@ -351,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle, | |||
351 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 351 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
352 | struct posix_acl *acl = NULL; | 352 | struct posix_acl *acl = NULL; |
353 | int ret = 0, ret2; | 353 | int ret = 0, ret2; |
354 | mode_t mode; | 354 | umode_t mode; |
355 | 355 | ||
356 | if (!S_ISLNK(inode->i_mode)) { | 356 | if (!S_ISLNK(inode->i_mode)) { |
357 | if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { | 357 | if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { |
diff --git a/fs/posix_acl.c b/fs/posix_acl.c index d43729a760e2..10027b42b7e2 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c | |||
@@ -149,10 +149,10 @@ posix_acl_valid(const struct posix_acl *acl) | |||
149 | * file mode permission bits, or else 1. Returns -E... on error. | 149 | * file mode permission bits, or else 1. Returns -E... on error. |
150 | */ | 150 | */ |
151 | int | 151 | int |
152 | posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) | 152 | posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) |
153 | { | 153 | { |
154 | const struct posix_acl_entry *pa, *pe; | 154 | const struct posix_acl_entry *pa, *pe; |
155 | mode_t mode = 0; | 155 | umode_t mode = 0; |
156 | int not_equiv = 0; | 156 | int not_equiv = 0; |
157 | 157 | ||
158 | FOREACH_ACL_ENTRY(pa, acl, pe) { | 158 | FOREACH_ACL_ENTRY(pa, acl, pe) { |
@@ -188,7 +188,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) | |||
188 | * Create an ACL representing the file mode permission bits of an inode. | 188 | * Create an ACL representing the file mode permission bits of an inode. |
189 | */ | 189 | */ |
190 | struct posix_acl * | 190 | struct posix_acl * |
191 | posix_acl_from_mode(mode_t mode, gfp_t flags) | 191 | posix_acl_from_mode(umode_t mode, gfp_t flags) |
192 | { | 192 | { |
193 | struct posix_acl *acl = posix_acl_alloc(3, flags); | 193 | struct posix_acl *acl = posix_acl_alloc(3, flags); |
194 | if (!acl) | 194 | if (!acl) |
@@ -279,11 +279,11 @@ check_perm: | |||
279 | * system calls. All permissions that are not granted by the acl are removed. | 279 | * system calls. All permissions that are not granted by the acl are removed. |
280 | * The permissions in the acl are changed to reflect the mode_p parameter. | 280 | * The permissions in the acl are changed to reflect the mode_p parameter. |
281 | */ | 281 | */ |
282 | static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) | 282 | static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) |
283 | { | 283 | { |
284 | struct posix_acl_entry *pa, *pe; | 284 | struct posix_acl_entry *pa, *pe; |
285 | struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; | 285 | struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; |
286 | mode_t mode = *mode_p; | 286 | umode_t mode = *mode_p; |
287 | int not_equiv = 0; | 287 | int not_equiv = 0; |
288 | 288 | ||
289 | /* assert(atomic_read(acl->a_refcount) == 1); */ | 289 | /* assert(atomic_read(acl->a_refcount) == 1); */ |
@@ -336,7 +336,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) | |||
336 | /* | 336 | /* |
337 | * Modify the ACL for the chmod syscall. | 337 | * Modify the ACL for the chmod syscall. |
338 | */ | 338 | */ |
339 | static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) | 339 | static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode) |
340 | { | 340 | { |
341 | struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; | 341 | struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; |
342 | struct posix_acl_entry *pa, *pe; | 342 | struct posix_acl_entry *pa, *pe; |
@@ -382,7 +382,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) | |||
382 | } | 382 | } |
383 | 383 | ||
384 | int | 384 | int |
385 | posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) | 385 | posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p) |
386 | { | 386 | { |
387 | struct posix_acl *clone = posix_acl_clone(*acl, gfp); | 387 | struct posix_acl *clone = posix_acl_clone(*acl, gfp); |
388 | int err = -ENOMEM; | 388 | int err = -ENOMEM; |
@@ -400,7 +400,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) | |||
400 | EXPORT_SYMBOL(posix_acl_create); | 400 | EXPORT_SYMBOL(posix_acl_create); |
401 | 401 | ||
402 | int | 402 | int |
403 | posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, mode_t mode) | 403 | posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) |
404 | { | 404 | { |
405 | struct posix_acl *clone = posix_acl_clone(*acl, gfp); | 405 | struct posix_acl *clone = posix_acl_clone(*acl, gfp); |
406 | int err = -ENOMEM; | 406 | int err = -ENOMEM; |
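The posix_acl.c hunks above are a mechanical mode_t to umode_t conversion, but the contract of posix_acl_equiv_mode() they touch is worth spelling out: an ACL collapses to plain mode bits only when it carries nothing beyond the owner/group/other (and mask) entries. A simplified userspace sketch of that rule, with toy types; the kernel version differs in details such as how it merges the result into *mode_p:

#include <stdio.h>

enum { ACL_USER_OBJ, ACL_USER, ACL_GROUP_OBJ, ACL_GROUP, ACL_MASK, ACL_OTHER };

struct toy_acl_entry {
	int e_tag;
	unsigned int e_perm;	/* rwx bits, 0..7 */
};

/*
 * Returns 0 and fills *mode_p when the ACL is equivalent to mode bits,
 * 1 when named-user/group entries force a full ACL, -1 on a bad tag.
 * Assumes entries arrive in the usual sorted order (group before mask).
 */
static int acl_equiv_mode(const struct toy_acl_entry *e, int n,
			  unsigned int *mode_p)
{
	unsigned int mode = 0;
	int not_equiv = 0, i;

	for (i = 0; i < n; i++) {
		switch (e[i].e_tag) {
		case ACL_USER_OBJ:  mode |= e[i].e_perm << 6; break;
		case ACL_GROUP_OBJ: mode |= e[i].e_perm << 3; break;
		case ACL_OTHER:     mode |= e[i].e_perm;      break;
		case ACL_MASK:	/* the mask, not the group entry, wins */
			mode = (mode & ~070u) | (e[i].e_perm << 3);
			not_equiv = 1;
			break;
		case ACL_USER:
		case ACL_GROUP:
			not_equiv = 1;
			break;
		default:
			return -1;
		}
	}
	if (!not_equiv)
		*mode_p = mode;
	return not_equiv;
}

int main(void)
{
	struct toy_acl_entry minimal[] = {
		{ ACL_USER_OBJ, 6 }, { ACL_GROUP_OBJ, 4 }, { ACL_OTHER, 4 },
	};
	unsigned int mode = 0;

	/* prints "equiv: 0, mode 644" */
	printf("equiv: %d, mode %o\n",
	       acl_equiv_mode(minimal, 3, &mode), mode);
	return 0;
}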
diff --git a/fs/proc/base.c b/fs/proc/base.c index 08e3eccf9a12..5eb02069e1b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -1118,7 +1118,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | |||
1118 | * Warn that /proc/pid/oom_adj is deprecated, see | 1118 | * Warn that /proc/pid/oom_adj is deprecated, see |
1119 | * Documentation/feature-removal-schedule.txt. | 1119 | * Documentation/feature-removal-schedule.txt. |
1120 | */ | 1120 | */ |
1121 | WARN_ONCE(1, "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", | 1121 | printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", |
1122 | current->comm, task_pid_nr(current), task_pid_nr(task), | 1122 | current->comm, task_pid_nr(current), task_pid_nr(task), |
1123 | task_pid_nr(task)); | 1123 | task_pid_nr(task)); |
1124 | task->signal->oom_adj = oom_adjust; | 1124 | task->signal->oom_adj = oom_adjust; |
@@ -1919,6 +1919,14 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) | |||
1919 | spin_lock(&files->file_lock); | 1919 | spin_lock(&files->file_lock); |
1920 | file = fcheck_files(files, fd); | 1920 | file = fcheck_files(files, fd); |
1921 | if (file) { | 1921 | if (file) { |
1922 | unsigned int f_flags; | ||
1923 | struct fdtable *fdt; | ||
1924 | |||
1925 | fdt = files_fdtable(files); | ||
1926 | f_flags = file->f_flags & ~O_CLOEXEC; | ||
1927 | if (FD_ISSET(fd, fdt->close_on_exec)) | ||
1928 | f_flags |= O_CLOEXEC; | ||
1929 | |||
1922 | if (path) { | 1930 | if (path) { |
1923 | *path = file->f_path; | 1931 | *path = file->f_path; |
1924 | path_get(&file->f_path); | 1932 | path_get(&file->f_path); |
@@ -1928,7 +1936,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) | |||
1928 | "pos:\t%lli\n" | 1936 | "pos:\t%lli\n" |
1929 | "flags:\t0%o\n", | 1937 | "flags:\t0%o\n", |
1930 | (long long) file->f_pos, | 1938 | (long long) file->f_pos, |
1931 | file->f_flags); | 1939 | f_flags); |
1932 | spin_unlock(&files->file_lock); | 1940 | spin_unlock(&files->file_lock); |
1933 | put_files_struct(files); | 1941 | put_files_struct(files); |
1934 | return 0; | 1942 | return 0; |
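The proc_fd_info() hunk above changes what userspace sees: /proc/<pid>/fdinfo/<fd> now reports O_CLOEXEC in its "flags" line, taken from the fdtable's close_on_exec bitmap rather than from file->f_flags (one struct file can back several descriptors with different close-on-exec settings). A quick Linux-only spot check, assuming a kernel with this patch applied:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64], line[256];
	FILE *f;
	int fd = open("/dev/null", O_RDONLY | O_CLOEXEC);

	if (fd < 0)
		return 1;
	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	f = fopen(path, "r");
	if (!f)
		return 1;
	/* the "flags:" line should now include the O_CLOEXEC bit
	 * (02000000 on x86) */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	close(fd);
	return 0;
}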
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 977ed2723845..893b961dcfd8 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c | |||
@@ -39,8 +39,9 @@ | |||
39 | #define PSTORE_NAMELEN 64 | 39 | #define PSTORE_NAMELEN 64 |
40 | 40 | ||
41 | struct pstore_private { | 41 | struct pstore_private { |
42 | struct pstore_info *psi; | ||
43 | enum pstore_type_id type; | ||
42 | u64 id; | 44 | u64 id; |
43 | int (*erase)(u64); | ||
44 | ssize_t size; | 45 | ssize_t size; |
45 | char data[]; | 46 | char data[]; |
46 | }; | 47 | }; |
@@ -73,7 +74,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) | |||
73 | { | 74 | { |
74 | struct pstore_private *p = dentry->d_inode->i_private; | 75 | struct pstore_private *p = dentry->d_inode->i_private; |
75 | 76 | ||
76 | p->erase(p->id); | 77 | p->psi->erase(p->type, p->id, p->psi); |
77 | 78 | ||
78 | return simple_unlink(dir, dentry); | 79 | return simple_unlink(dir, dentry); |
79 | } | 80 | } |
@@ -175,8 +176,8 @@ int pstore_is_mounted(void) | |||
175 | * Set the mtime & ctime to the date that this record was originally stored. | 176 | * Set the mtime & ctime to the date that this record was originally stored. |
176 | */ | 177 | */ |
177 | int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, | 178 | int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, |
178 | char *data, size_t size, | 179 | char *data, size_t size, struct timespec time, |
179 | struct timespec time, int (*erase)(u64)) | 180 | struct pstore_info *psi) |
180 | { | 181 | { |
181 | struct dentry *root = pstore_sb->s_root; | 182 | struct dentry *root = pstore_sb->s_root; |
182 | struct dentry *dentry; | 183 | struct dentry *dentry; |
@@ -192,8 +193,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, | |||
192 | private = kmalloc(sizeof *private + size, GFP_KERNEL); | 193 | private = kmalloc(sizeof *private + size, GFP_KERNEL); |
193 | if (!private) | 194 | if (!private) |
194 | goto fail_alloc; | 195 | goto fail_alloc; |
196 | private->type = type; | ||
195 | private->id = id; | 197 | private->id = id; |
196 | private->erase = erase; | 198 | private->psi = psi; |
197 | 199 | ||
198 | switch (type) { | 200 | switch (type) { |
199 | case PSTORE_TYPE_DMESG: | 201 | case PSTORE_TYPE_DMESG: |
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 8c9f23eb1645..611c1b3c46fa 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h | |||
@@ -2,5 +2,5 @@ extern void pstore_set_kmsg_bytes(int); | |||
2 | extern void pstore_get_records(void); | 2 | extern void pstore_get_records(void); |
3 | extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, | 3 | extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, |
4 | char *data, size_t size, | 4 | char *data, size_t size, |
5 | struct timespec time, int (*erase)(u64)); | 5 | struct timespec time, struct pstore_info *psi); |
6 | extern int pstore_is_mounted(void); | 6 | extern int pstore_is_mounted(void); |
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f2c3ff20ea68..c5300ec31696 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c | |||
@@ -37,6 +37,8 @@ | |||
37 | static DEFINE_SPINLOCK(pstore_lock); | 37 | static DEFINE_SPINLOCK(pstore_lock); |
38 | static struct pstore_info *psinfo; | 38 | static struct pstore_info *psinfo; |
39 | 39 | ||
40 | static char *backend; | ||
41 | |||
40 | /* How much of the console log to snapshot */ | 42 | /* How much of the console log to snapshot */ |
41 | static unsigned long kmsg_bytes = 10240; | 43 | static unsigned long kmsg_bytes = 10240; |
42 | 44 | ||
@@ -67,7 +69,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, | |||
67 | unsigned long size, total = 0; | 69 | unsigned long size, total = 0; |
68 | char *dst, *why; | 70 | char *dst, *why; |
69 | u64 id; | 71 | u64 id; |
70 | int hsize, part = 1; | 72 | int hsize; |
73 | unsigned int part = 1; | ||
71 | 74 | ||
72 | if (reason < ARRAY_SIZE(reason_str)) | 75 | if (reason < ARRAY_SIZE(reason_str)) |
73 | why = reason_str[reason]; | 76 | why = reason_str[reason]; |
@@ -78,7 +81,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, | |||
78 | oopscount++; | 81 | oopscount++; |
79 | while (total < kmsg_bytes) { | 82 | while (total < kmsg_bytes) { |
80 | dst = psinfo->buf; | 83 | dst = psinfo->buf; |
81 | hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); | 84 | hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); |
82 | size = psinfo->bufsize - hsize; | 85 | size = psinfo->bufsize - hsize; |
83 | dst += hsize; | 86 | dst += hsize; |
84 | 87 | ||
@@ -94,14 +97,16 @@ static void pstore_dump(struct kmsg_dumper *dumper, | |||
94 | memcpy(dst, s1 + s1_start, l1_cpy); | 97 | memcpy(dst, s1 + s1_start, l1_cpy); |
95 | memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); | 98 | memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); |
96 | 99 | ||
97 | id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); | 100 | id = psinfo->write(PSTORE_TYPE_DMESG, part, |
101 | hsize + l1_cpy + l2_cpy, psinfo); | ||
98 | if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) | 102 | if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) |
99 | pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, | 103 | pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, |
100 | psinfo->buf, hsize + l1_cpy + l2_cpy, | 104 | psinfo->buf, hsize + l1_cpy + l2_cpy, |
101 | CURRENT_TIME, psinfo->erase); | 105 | CURRENT_TIME, psinfo); |
102 | l1 -= l1_cpy; | 106 | l1 -= l1_cpy; |
103 | l2 -= l2_cpy; | 107 | l2 -= l2_cpy; |
104 | total += l1_cpy + l2_cpy; | 108 | total += l1_cpy + l2_cpy; |
109 | part++; | ||
105 | } | 110 | } |
106 | mutex_unlock(&psinfo->buf_mutex); | 111 | mutex_unlock(&psinfo->buf_mutex); |
107 | } | 112 | } |
@@ -128,6 +133,12 @@ int pstore_register(struct pstore_info *psi) | |||
128 | spin_unlock(&pstore_lock); | 133 | spin_unlock(&pstore_lock); |
129 | return -EBUSY; | 134 | return -EBUSY; |
130 | } | 135 | } |
136 | |||
137 | if (backend && strcmp(backend, psi->name)) { | ||
138 | spin_unlock(&pstore_lock); | ||
139 | return -EINVAL; | ||
140 | } | ||
141 | |||
131 | psinfo = psi; | 142 | psinfo = psi; |
132 | spin_unlock(&pstore_lock); | 143 | spin_unlock(&pstore_lock); |
133 | 144 | ||
@@ -166,9 +177,9 @@ void pstore_get_records(void) | |||
166 | if (rc) | 177 | if (rc) |
167 | goto out; | 178 | goto out; |
168 | 179 | ||
169 | while ((size = psi->read(&id, &type, &time)) > 0) { | 180 | while ((size = psi->read(&id, &type, &time, psi)) > 0) { |
170 | if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, | 181 | if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, |
171 | time, psi->erase)) | 182 | time, psi)) |
172 | failed++; | 183 | failed++; |
173 | } | 184 | } |
174 | psi->close(psi); | 185 | psi->close(psi); |
@@ -196,12 +207,15 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size) | |||
196 | 207 | ||
197 | mutex_lock(&psinfo->buf_mutex); | 208 | mutex_lock(&psinfo->buf_mutex); |
198 | memcpy(psinfo->buf, buf, size); | 209 | memcpy(psinfo->buf, buf, size); |
199 | id = psinfo->write(type, size); | 210 | id = psinfo->write(type, 0, size, psinfo); |
200 | if (pstore_is_mounted()) | 211 | if (pstore_is_mounted()) |
201 | pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, | 212 | pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf, |
202 | size, CURRENT_TIME, psinfo->erase); | 213 | size, CURRENT_TIME, psinfo); |
203 | mutex_unlock(&psinfo->buf_mutex); | 214 | mutex_unlock(&psinfo->buf_mutex); |
204 | 215 | ||
205 | return 0; | 216 | return 0; |
206 | } | 217 | } |
207 | EXPORT_SYMBOL_GPL(pstore_write); | 218 | EXPORT_SYMBOL_GPL(pstore_write); |
219 | |||
220 | module_param(backend, charp, 0444); | ||
221 | MODULE_PARM_DESC(backend, "Pstore backend to use"); | ||
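Taken together, the pstore hunks above thread a struct pstore_info pointer through every callback, so a backend reaches its own state without globals, and pstore_register() now refuses a backend whose name does not match a boot-time backend= parameter. A userspace mock of the reshaped plumbing, with stand-in types and callback signatures read off this diff (the names "efivars" and "ramoops" are illustrative only):

#include <stddef.h>
#include <stdio.h>
#include <string.h>

enum pstore_type_id { PSTORE_TYPE_DMESG };

struct pstore_info {
	const char *name;
	unsigned long long (*write)(enum pstore_type_id type, unsigned int part,
				    size_t size, struct pstore_info *psi);
	int (*erase)(enum pstore_type_id type, unsigned long long id,
		     struct pstore_info *psi);
};

static const char *backend = "efivars";	/* stand-in for the module param */

static int pstore_register(struct pstore_info *psi)
{
	if (backend && strcmp(backend, psi->name))
		return -1;	/* the kernel returns -EINVAL here */
	printf("registered backend %s\n", psi->name);
	return 0;
}

static unsigned long long mock_write(enum pstore_type_id type, unsigned int part,
				     size_t size, struct pstore_info *psi)
{
	printf("%s: write part %u, %zu bytes\n", psi->name, part, size);
	return 42;	/* record id handed back to pstore_mkfile() */
}

static int mock_erase(enum pstore_type_id type, unsigned long long id,
		      struct pstore_info *psi)
{
	printf("%s: erase record %llu\n", psi->name, id);
	return 0;
}

int main(void)
{
	struct pstore_info efi = { "efivars", mock_write, mock_erase };
	struct pstore_info ram = { "ramoops", mock_write, mock_erase };
	unsigned long long id;

	pstore_register(&ram);	/* rejected: name does not match backend= */
	pstore_register(&efi);	/* accepted */
	id = efi.write(PSTORE_TYPE_DMESG, 1, 128, &efi);
	efi.erase(PSTORE_TYPE_DMESG, id, &efi);
	return 0;
}

This also explains the pstore_private change in fs/pstore/inode.c: the per-file private data now records (type, id, psi) so pstore_unlink() can call psi->erase() against the right backend.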
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 7362cf4c946a..6da0396e5052 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c | |||
@@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, | |||
272 | case ACL_TYPE_ACCESS: | 272 | case ACL_TYPE_ACCESS: |
273 | name = POSIX_ACL_XATTR_ACCESS; | 273 | name = POSIX_ACL_XATTR_ACCESS; |
274 | if (acl) { | 274 | if (acl) { |
275 | mode_t mode = inode->i_mode; | 275 | error = posix_acl_equiv_mode(acl, &inode->i_mode); |
276 | error = posix_acl_equiv_mode(acl, &mode); | ||
277 | if (error < 0) | 276 | if (error < 0) |
278 | return error; | 277 | return error; |
279 | else { | 278 | else { |
280 | inode->i_mode = mode; | ||
281 | if (error == 0) | 279 | if (error == 0) |
282 | acl = NULL; | 280 | acl = NULL; |
283 | } | 281 | } |
@@ -354,8 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, | |||
354 | return PTR_ERR(acl); | 352 | return PTR_ERR(acl); |
355 | 353 | ||
356 | if (acl) { | 354 | if (acl) { |
357 | mode_t mode = inode->i_mode; | ||
358 | |||
359 | /* Copy the default ACL to the default ACL of a new directory */ | 355 | /* Copy the default ACL to the default ACL of a new directory */ |
360 | if (S_ISDIR(inode->i_mode)) { | 356 | if (S_ISDIR(inode->i_mode)) { |
361 | err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, | 357 | err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT, |
@@ -366,12 +362,10 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, | |||
366 | 362 | ||
367 | /* Now we reconcile the new ACL and the mode, | 363 | /* Now we reconcile the new ACL and the mode, |
368 | potentially modifying both */ | 364 | potentially modifying both */ |
369 | err = posix_acl_create(&acl, GFP_NOFS, &mode); | 365 | err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); |
370 | if (err < 0) | 366 | if (err < 0) |
371 | return err; | 367 | return err; |
372 | 368 | ||
373 | inode->i_mode = mode; | ||
374 | |||
375 | /* If we need an ACL.. */ | 369 | /* If we need an ACL.. */ |
376 | if (err > 0) | 370 | if (err > 0) |
377 | err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); | 371 | err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl); |
diff --git a/fs/stack.c b/fs/stack.c index 4a6f7f440658..b4f2ab48a61f 100644 --- a/fs/stack.c +++ b/fs/stack.c | |||
@@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) | |||
29 | * | 29 | * |
30 | * We don't actually know what locking is used at the lower level; | 30 | * We don't actually know what locking is used at the lower level; |
31 | * but if it's a filesystem that supports quotas, it will be using | 31 | * but if it's a filesystem that supports quotas, it will be using |
32 | * i_lock as in inode_add_bytes(). tmpfs uses other locking, and | 32 | * i_lock as in inode_add_bytes(). |
33 | * its 32-bit is (just) able to exceed 2TB i_size with the aid of | ||
34 | * holes; but its i_blocks cannot carry into the upper long without | ||
35 | * almost 2TB swap - let's ignore that case. | ||
36 | */ | 33 | */ |
37 | if (sizeof(i_blocks) > sizeof(long)) | 34 | if (sizeof(i_blocks) > sizeof(long)) |
38 | spin_lock(&src->i_lock); | 35 | spin_lock(&src->i_lock); |
diff --git a/fs/stat.c b/fs/stat.c --- a/fs/stat.c +++ b/fs/stat.c | |||
@@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) | |||
27 | stat->uid = inode->i_uid; | 27 | stat->uid = inode->i_uid; |
28 | stat->gid = inode->i_gid; | 28 | stat->gid = inode->i_gid; |
29 | stat->rdev = inode->i_rdev; | 29 | stat->rdev = inode->i_rdev; |
30 | stat->size = i_size_read(inode); | ||
30 | stat->atime = inode->i_atime; | 31 | stat->atime = inode->i_atime; |
31 | stat->mtime = inode->i_mtime; | 32 | stat->mtime = inode->i_mtime; |
32 | stat->ctime = inode->i_ctime; | 33 | stat->ctime = inode->i_ctime; |
33 | stat->size = i_size_read(inode); | ||
34 | stat->blocks = inode->i_blocks; | ||
35 | stat->blksize = (1 << inode->i_blkbits); | 34 | stat->blksize = (1 << inode->i_blkbits); |
35 | stat->blocks = inode->i_blocks; | ||
36 | } | 36 | } |
37 | 37 | ||
38 | EXPORT_SYMBOL(generic_fillattr); | 38 | EXPORT_SYMBOL(generic_fillattr); |
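The generic_fillattr() reorder above appears purely cosmetic: stat->size moves up next to the other plain inode copies, and blksize/blocks swap places, with no behavioral change. For reference, these are exactly the fields stat(2) hands back to userspace; a trivial spot check:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat st;

	if (stat("/", &st))	/* any existing path will do */
		return 1;
	printf("size %lld, blksize %ld, blocks %lld\n",
	       (long long)st.st_size, (long)st.st_blksize,
	       (long long)st.st_blocks);
	return 0;
}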
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 44ce51656804..b6c4b3795c4a 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c | |||
@@ -221,7 +221,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
221 | } | 221 | } |
222 | 222 | ||
223 | static int | 223 | static int |
224 | xfs_set_mode(struct inode *inode, mode_t mode) | 224 | xfs_set_mode(struct inode *inode, umode_t mode) |
225 | { | 225 | { |
226 | int error = 0; | 226 | int error = 0; |
227 | 227 | ||
@@ -267,7 +267,7 @@ posix_acl_default_exists(struct inode *inode) | |||
267 | int | 267 | int |
268 | xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) | 268 | xfs_inherit_acl(struct inode *inode, struct posix_acl *acl) |
269 | { | 269 | { |
270 | mode_t mode = inode->i_mode; | 270 | umode_t mode = inode->i_mode; |
271 | int error = 0, inherit = 0; | 271 | int error = 0, inherit = 0; |
272 | 272 | ||
273 | if (S_ISDIR(inode->i_mode)) { | 273 | if (S_ISDIR(inode->i_mode)) { |
@@ -381,7 +381,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, | |||
381 | goto out_release; | 381 | goto out_release; |
382 | 382 | ||
383 | if (type == ACL_TYPE_ACCESS) { | 383 | if (type == ACL_TYPE_ACCESS) { |
384 | mode_t mode = inode->i_mode; | 384 | umode_t mode = inode->i_mode; |
385 | error = posix_acl_equiv_mode(acl, &mode); | 385 | error = posix_acl_equiv_mode(acl, &mode); |
386 | 386 | ||
387 | if (error <= 0) { | 387 | if (error <= 0) { |