author    David S. Miller <davem@davemloft.net>    2011-08-08 02:20:26 -0400
committer David S. Miller <davem@davemloft.net>    2011-08-08 02:20:26 -0400
commit    19fd61785a580c60cba900c5171bfadb57dd5056 (patch)
tree      1e491fb014be0dc03f4b6755bb94e73afd38c455 /fs
parent    57569d0e12eaf31717e295960cd2a26f626c8e5b (diff)
parent    8028837d71ba9904b17281b40f94b93e947fbe38 (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/acl.c | 6
-rw-r--r--  fs/9p/acl.h | 4
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 6
-rw-r--r--  fs/Kconfig | 15
-rw-r--r--  fs/block_dev.c | 5
-rw-r--r--  fs/btrfs/Makefile | 4
-rw-r--r--  fs/btrfs/acl.c | 27
-rw-r--r--  fs/btrfs/compression.c | 14
-rw-r--r--  fs/btrfs/ctree.h | 30
-rw-r--r--  fs/btrfs/dir-item.c | 30
-rw-r--r--  fs/btrfs/extent-tree.c | 45
-rw-r--r--  fs/btrfs/extent_io.c | 139
-rw-r--r--  fs/btrfs/extent_io.h | 20
-rw-r--r--  fs/btrfs/extent_map.c | 155
-rw-r--r--  fs/btrfs/file-item.c | 7
-rw-r--r--  fs/btrfs/file.c | 21
-rw-r--r--  fs/btrfs/inode.c | 145
-rw-r--r--  fs/btrfs/ioctl.c | 3
-rw-r--r--  fs/btrfs/ref-cache.c | 68
-rw-r--r--  fs/btrfs/ref-cache.h | 52
-rw-r--r--  fs/btrfs/root-tree.c | 5
-rw-r--r--  fs/btrfs/transaction.c | 65
-rw-r--r--  fs/btrfs/tree-log.c | 12
-rw-r--r--  fs/btrfs/volumes.c | 12
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 5
-rw-r--r--  fs/cifs/cifsencrypt.c | 16
-rw-r--r--  fs/cifs/cifsfs.c | 22
-rw-r--r--  fs/cifs/cifsfs.h | 4
-rw-r--r--  fs/cifs/cifsglob.h | 2
-rw-r--r--  fs/cifs/cifssmb.c | 6
-rw-r--r--  fs/cifs/connect.c | 659
-rw-r--r--  fs/cifs/dns_resolve.c | 4
-rw-r--r--  fs/cifs/file.c | 27
-rw-r--r--  fs/cifs/inode.c | 14
-rw-r--r--  fs/cifs/misc.c | 11
-rw-r--r--  fs/cifs/transport.c | 2
-rw-r--r--  fs/compat_ioctl.c | 1
-rw-r--r--  fs/dcache.c | 72
-rw-r--r--  fs/exofs/Kbuild | 5
-rw-r--r--  fs/exofs/Kconfig | 4
-rw-r--r--  fs/exofs/exofs.h | 159
-rw-r--r--  fs/exofs/inode.c | 152
-rw-r--r--  fs/exofs/ore.c (renamed from fs/exofs/ios.c) | 370
-rw-r--r--  fs/exofs/pnfs.h | 45
-rw-r--r--  fs/exofs/super.c | 251
-rw-r--r--  fs/ext2/acl.c | 8
-rw-r--r--  fs/ext3/acl.c | 9
-rw-r--r--  fs/ext4/Makefile | 2
-rw-r--r--  fs/ext4/acl.c | 9
-rw-r--r--  fs/ext4/balloc.c | 48
-rw-r--r--  fs/ext4/block_validity.c | 21
-rw-r--r--  fs/ext4/ext4.h | 55
-rw-r--r--  fs/ext4/extents.c | 129
-rw-r--r--  fs/ext4/fsync.c | 26
-rw-r--r--  fs/ext4/ialloc.c | 2
-rw-r--r--  fs/ext4/indirect.c | 1482
-rw-r--r--  fs/ext4/inode.c | 1596
-rw-r--r--  fs/ext4/ioctl.c | 12
-rw-r--r--  fs/ext4/mballoc.c | 230
-rw-r--r--  fs/ext4/mballoc.h | 1
-rw-r--r--  fs/ext4/namei.c | 21
-rw-r--r--  fs/ext4/page-io.c | 6
-rw-r--r--  fs/ext4/resize.c | 199
-rw-r--r--  fs/ext4/super.c | 88
-rw-r--r--  fs/ext4/truncate.h | 43
-rw-r--r--  fs/generic_acl.c | 13
-rw-r--r--  fs/gfs2/acl.c | 6
-rw-r--r--  fs/hppfs/hppfs.c | 1
-rw-r--r--  fs/inode.c | 14
-rw-r--r--  fs/jbd2/checkpoint.c | 5
-rw-r--r--  fs/jbd2/journal.c | 67
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/acl.h | 2
-rw-r--r--  fs/jffs2/fs.c | 2
-rw-r--r--  fs/jffs2/os-linux.h | 2
-rw-r--r--  fs/jfs/acl.c | 4
-rw-r--r--  fs/jfs/xattr.c | 4
-rw-r--r--  fs/namei.c | 117
-rw-r--r--  fs/nfs/nfs3acl.c | 2
-rw-r--r--  fs/nfs/nfs3proc.c | 6
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/posix_acl.c | 16
-rw-r--r--  fs/proc/base.c | 12
-rw-r--r--  fs/pstore/inode.c | 12
-rw-r--r--  fs/pstore/internal.h | 2
-rw-r--r--  fs/pstore/platform.c | 30
-rw-r--r--  fs/reiserfs/xattr_acl.c | 10
-rw-r--r--  fs/stack.c | 5
-rw-r--r--  fs/stat.c | 4
-rw-r--r--  fs/xfs/linux-2.6/xfs_acl.c | 6
90 files changed, 3509 insertions(+), 3549 deletions(-)
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index e9cb57f07546..9a1d42630751 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -182,11 +182,11 @@ int v9fs_set_create_acl(struct dentry *dentry,
 	return 0;
 }
 
-int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 		  struct posix_acl **dpacl, struct posix_acl **pacl)
 {
 	int retval = 0;
-	mode_t mode = *modep;
+	umode_t mode = *modep;
 	struct posix_acl *acl = NULL;
 
 	if (!S_ISLNK(mode)) {
@@ -319,7 +319,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			mode_t mode = inode->i_mode;
+			umode_t mode = inode->i_mode;
 			retval = posix_acl_equiv_mode(acl, &mode);
 			if (retval < 0)
 				goto err_out;
diff --git a/fs/9p/acl.h b/fs/9p/acl.h
index ddb7ae19d971..559556411965 100644
--- a/fs/9p/acl.h
+++ b/fs/9p/acl.h
@@ -20,7 +20,7 @@ extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type);
 extern int v9fs_acl_chmod(struct dentry *);
 extern int v9fs_set_create_acl(struct dentry *,
 			       struct posix_acl **, struct posix_acl **);
-extern int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+extern int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 			 struct posix_acl **dpacl, struct posix_acl **pacl);
 #else
 #define v9fs_iop_get_acl NULL
@@ -38,7 +38,7 @@ static inline int v9fs_set_create_acl(struct dentry *dentry,
 {
 	return 0;
 }
-static inline int v9fs_acl_mode(struct inode *dir, mode_t *modep,
+static inline int v9fs_acl_mode(struct inode *dir, umode_t *modep,
 				struct posix_acl **dpacl,
 				struct posix_acl **pacl)
 {
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 9a26dce5a99f..b6c8ed205192 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -206,7 +206,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err = 0;
 	gid_t gid;
 	int flags;
-	mode_t mode;
+	umode_t mode;
 	char *name = NULL;
 	struct file *filp;
 	struct p9_qid qid;
@@ -348,7 +348,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct inode *inode;
 	struct p9_qid qid;
 	struct dentry *dir_dentry;
@@ -751,7 +751,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
 	int err;
 	gid_t gid;
 	char *name;
-	mode_t mode;
+	umode_t mode;
 	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
diff --git a/fs/Kconfig b/fs/Kconfig
index 19891aab9c6e..9fe0b349f4cd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -127,14 +127,21 @@ config TMPFS_POSIX_ACL
 	select TMPFS_XATTR
 	select GENERIC_ACL
 	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
+	  POSIX Access Control Lists (ACLs) support additional access rights
+	  for users and groups beyond the standard owner/group/world scheme,
+	  and this option selects support for ACLs specifically for tmpfs
+	  filesystems.
+
+	  If you've selected TMPFS, it's possible that you'll also need
+	  this option as there are a number of Linux distros that require
+	  POSIX ACL support under /dev for certain features to work properly.
+	  For example, some distros need this feature for ALSA-related /dev
+	  files for sound to work properly.  In short, if you're not sure,
+	  say Y.
 
 	  To learn more about Access Control Lists, visit the POSIX ACLs for
 	  Linux website <http://acl.bestbits.at/>.
 
-	  If you don't know what Access Control Lists are, say N.
-
 config TMPFS_XATTR
 	bool "Tmpfs extended attributes"
 	depends on TMPFS
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f55aad4d1611..ff77262e887c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -387,6 +387,10 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	struct inode *bd_inode = filp->f_mapping->host;
 	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
+
+	error = filemap_write_and_wait_range(filp->f_mapping, start, end);
+	if (error)
+		return error;
 
 	/*
 	 * There is no need to serialise calls to blkdev_issue_flush with
@@ -552,6 +556,7 @@ struct block_device *bdget(dev_t dev)
 
 	if (inode->i_state & I_NEW) {
 		bdev->bd_contains = NULL;
+		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9b72dcf1cd25..40e6ac08c21f 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 65a735d8f6e4..eb159aaa5a11 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -28,8 +28,6 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
 	int size;
@@ -111,7 +109,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 	int ret, size = 0;
 	const char *name;
 	char *value = NULL;
-	mode_t mode;
 
 	if (acl) {
 		ret = posix_acl_valid(acl);
@@ -122,13 +119,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		mode = inode->i_mode;
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &mode);
+			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
 				return ret;
-			inode->i_mode = mode;
 		}
 		ret = 0;
 		break;
@@ -222,19 +217,16 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
-		mode_t mode = inode->i_mode;
-
 		if (S_ISDIR(inode->i_mode)) {
 			ret = btrfs_set_acl(trans, inode, acl,
 					    ACL_TYPE_DEFAULT);
 			if (ret)
 				goto failed;
 		}
-		ret = posix_acl_create(&acl, GFP_NOFS, &mode);
+		ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
 		if (ret < 0)
 			return ret;
 
-		inode->i_mode = mode;
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
@@ -282,18 +274,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.get = btrfs_xattr_acl_get,
 	.set = btrfs_xattr_acl_set,
 };
-
-#else /* CONFIG_BTRFS_FS_POSIX_ACL */
-
-int btrfs_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
-		   struct inode *inode, struct inode *dir)
-{
-	return 0;
-}
-
-#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfe42b03eaf9..8ec5d86f1734 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -338,6 +338,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	u64 first_byte = disk_start;
 	struct block_device *bdev;
 	int ret;
+	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -392,8 +393,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-	BUG_ON(ret);
+	if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio,
+					 start, 1);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 	BUG_ON(ret);
@@ -418,8 +422,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-	BUG_ON(ret);
+	if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 	BUG_ON(ret);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 365c4e1dde04..0469263e327e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2406,8 +2406,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 		      btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
-int btrfs_set_root_node(struct btrfs_root_item *item,
-			struct extent_buffer *node);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+			 struct extent_buffer *node);
 void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
 
 /* dir-item.c */
@@ -2523,6 +2523,14 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
 #define PageChecked PageFsMisc
 #endif
 
+/* This forces readahead on a given range of bytes in an inode */
+static inline void btrfs_force_ra(struct address_space *mapping,
+				  struct file_ra_state *ra, struct file *file,
+				  pgoff_t offset, unsigned long req_size)
+{
+	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+}
+
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -2551,9 +2559,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
 
-unsigned long btrfs_force_ra(struct address_space *mapping,
-			     struct file_ra_state *ra, struct file *file,
-			     pgoff_t offset, pgoff_t last_index);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -2648,12 +2653,21 @@ do { \
 /* acl.c */
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
-#else
-#define btrfs_get_acl NULL
-#endif
 int btrfs_init_acl(struct btrfs_trans_handle *trans,
 		   struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
+#else
+#define btrfs_get_acl NULL
+static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
+				 struct inode *inode, struct inode *dir)
+{
+	return 0;
+}
+static inline int btrfs_acl_chmod(struct inode *inode)
+{
+	return 0;
+}
+#endif
 
 /* relocation.c */
 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index c360a848d97f..31d84e78129b 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -198,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
 
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -209,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
 		return ERR_PTR(ret);
-	if (ret > 0) {
-		if (path->slots[0] == 0)
-			return NULL;
-		path->slots[0]--;
-	}
-
-	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-	if (found_key.objectid != dir ||
-	    btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
-	    found_key.offset != key.offset)
+	if (ret > 0)
 		return NULL;
 
 	return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -315,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
 
 	key.objectid = dir;
 	btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
@@ -324,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
 		return ERR_PTR(ret);
-	if (ret > 0) {
-		if (path->slots[0] == 0)
-			return NULL;
-		path->slots[0]--;
-	}
-
-	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-
-	if (found_key.objectid != dir ||
-	    btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
-	    found_key.offset != key.offset)
+	if (ret > 0)
 		return NULL;
 
 	return btrfs_match_dir_item_name(root, path, name, name_len);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4d08ed79405d..66bac226944e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -663,7 +663,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 	struct btrfs_path *path;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	key.objectid = start;
 	key.offset = len;
 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -3272,6 +3274,9 @@ again:
 	}
 
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
+	if (ret < 0 && ret != -ENOSPC)
+		goto out;
+
 	spin_lock(&space_info->lock);
 	if (ret)
 		space_info->full = 1;
@@ -3281,6 +3286,7 @@ again:
 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
 	space_info->chunk_alloc = 0;
 	spin_unlock(&space_info->lock);
+out:
 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
 	return ret;
 }
@@ -4456,7 +4462,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			printk(KERN_ERR "umm, got %d back from search"
 			       ", was looking for %llu\n", ret,
 			       (unsigned long long)bytenr);
-			btrfs_print_leaf(extent_root, path->nodes[0]);
+			if (ret > 0)
+				btrfs_print_leaf(extent_root,
+						 path->nodes[0]);
 		}
 		BUG_ON(ret);
 		extent_slot = path->slots[0];
@@ -5073,7 +5081,9 @@ have_block_group:
 		 * group is does point to and try again
 		 */
 		if (!last_ptr_loop && last_ptr->block_group &&
-		    last_ptr->block_group != block_group) {
+		    last_ptr->block_group != block_group &&
+		    index <=
+		    get_block_group_index(last_ptr->block_group)) {
 
 			btrfs_put_block_group(block_group);
 			block_group = last_ptr->block_group;
@@ -5501,7 +5511,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -6272,10 +6283,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	int level;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	wc = kzalloc(sizeof(*wc), GFP_NOFS);
-	BUG_ON(!wc);
+	if (!wc) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
 
 	trans = btrfs_start_transaction(tree_root, 0);
 	BUG_ON(IS_ERR(trans));
@@ -6538,8 +6553,6 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	u64 min_allocable_bytes;
 	int ret = -ENOSPC;
 
-	if (cache->ro)
-		return 0;
 
 	/*
 	 * We need some metadata space and system metadata space for
@@ -6555,6 +6568,12 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 
 	spin_lock(&sinfo->lock);
 	spin_lock(&cache->lock);
+
+	if (cache->ro) {
+		ret = 0;
+		goto out;
+	}
+
 	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 
@@ -6568,7 +6587,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 		cache->ro = 1;
 		ret = 0;
 	}
-
+out:
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
 	return ret;
@@ -7183,11 +7202,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	spin_unlock(&cluster->refill_lock);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	inode = lookup_free_space_inode(root, block_group, path);
 	if (!IS_ERR(inode)) {
-		btrfs_orphan_add(trans, inode);
+		ret = btrfs_orphan_add(trans, inode);
+		BUG_ON(ret);
 		clear_nlink(inode);
 		/* One for the block groups ref */
 		spin_lock(&block_group->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 067b1747421b..d418164a35f1 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -254,14 +254,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
  *
  * This should be called with the tree lock held.
  */
-static int merge_state(struct extent_io_tree *tree,
-		       struct extent_state *state)
+static void merge_state(struct extent_io_tree *tree,
+			struct extent_state *state)
 {
 	struct extent_state *other;
 	struct rb_node *other_node;
 
 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
-		return 0;
+		return;
 
 	other_node = rb_prev(&state->rb_node);
 	if (other_node) {
@@ -287,19 +287,13 @@ static int merge_state(struct extent_io_tree *tree,
 			free_extent_state(other);
 		}
 	}
-
-	return 0;
 }
 
-static int set_state_cb(struct extent_io_tree *tree,
-			struct extent_state *state, int *bits)
+static void set_state_cb(struct extent_io_tree *tree,
+			 struct extent_state *state, int *bits)
 {
-	if (tree->ops && tree->ops->set_bit_hook) {
-		return tree->ops->set_bit_hook(tree->mapping->host,
-					       state, bits);
-	}
-
-	return 0;
+	if (tree->ops && tree->ops->set_bit_hook)
+		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
@@ -309,6 +303,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state, int *bits);
+
 /*
  * insert an extent_state struct into the tree. 'bits' are set on the
  * struct before it is inserted.
@@ -324,8 +321,6 @@ static int insert_state(struct extent_io_tree *tree,
 			int *bits)
 {
 	struct rb_node *node;
-	int bits_to_set = *bits & ~EXTENT_CTLBITS;
-	int ret;
 
 	if (end < start) {
 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -335,13 +330,9 @@ static int insert_state(struct extent_io_tree *tree,
 	}
 	state->start = start;
 	state->end = end;
-	ret = set_state_cb(tree, state, bits);
-	if (ret)
-		return ret;
 
-	if (bits_to_set & EXTENT_DIRTY)
-		tree->dirty_bytes += end - start + 1;
-	state->state |= bits_to_set;
+	set_state_bits(tree, state, bits);
+
 	node = tree_insert(&tree->state, end, &state->rb_node);
 	if (node) {
 		struct extent_state *found;
@@ -357,13 +348,11 @@ static int insert_state(struct extent_io_tree *tree,
 	return 0;
 }
 
-static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
-		    u64 split)
+static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
+		     u64 split)
 {
 	if (tree->ops && tree->ops->split_extent_hook)
-		return tree->ops->split_extent_hook(tree->mapping->host,
-						    orig, split);
-	return 0;
+		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 }
 
 /*
@@ -659,34 +648,25 @@ again:
 		if (start > end)
 			break;
 
-		if (need_resched()) {
-			spin_unlock(&tree->lock);
-			cond_resched();
-			spin_lock(&tree->lock);
-		}
+		cond_resched_lock(&tree->lock);
 	}
 out:
 	spin_unlock(&tree->lock);
 	return 0;
 }
 
-static int set_state_bits(struct extent_io_tree *tree,
-			  struct extent_state *state,
-			  int *bits)
+static void set_state_bits(struct extent_io_tree *tree,
+			   struct extent_state *state,
+			   int *bits)
 {
-	int ret;
 	int bits_to_set = *bits & ~EXTENT_CTLBITS;
 
-	ret = set_state_cb(tree, state, bits);
-	if (ret)
-		return ret;
+	set_state_cb(tree, state, bits);
 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
 		tree->dirty_bytes += range;
 	}
 	state->state |= bits_to_set;
-
-	return 0;
 }
 
 static void cache_state(struct extent_state *state,
@@ -779,9 +759,7 @@ hit_next:
 			goto out;
 		}
 
-		err = set_state_bits(tree, state, &bits);
-		if (err)
-			goto out;
+		set_state_bits(tree, state, &bits);
 
 		cache_state(state, cached_state);
 		merge_state(tree, state);
@@ -830,9 +808,7 @@ hit_next:
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			err = set_state_bits(tree, state, &bits);
-			if (err)
-				goto out;
+			set_state_bits(tree, state, &bits);
 			cache_state(state, cached_state);
 			merge_state(tree, state);
 			if (last_end == (u64)-1)
@@ -893,11 +869,7 @@ hit_next:
 	err = split_state(tree, state, prealloc, end + 1);
 	BUG_ON(err == -EEXIST);
 
-	err = set_state_bits(tree, prealloc, &bits);
-	if (err) {
-		prealloc = NULL;
-		goto out;
-	}
+	set_state_bits(tree, prealloc, &bits);
 	cache_state(prealloc, cached_state);
 	merge_state(tree, prealloc);
 	prealloc = NULL;
@@ -1059,46 +1031,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 	return 0;
 }
 
-/*
- * find the first offset in the io tree with 'bits' set. zero is
- * returned if we find something, and *start_ret and *end_ret are
- * set to reflect the state struct that was found.
- *
- * If nothing was found, 1 is returned, < 0 on error
- */
-int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-	int ret = 1;
-
-	spin_lock(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(tree, start);
-	if (!node)
-		goto out;
-
-	while (1) {
-		state = rb_entry(node, struct extent_state, rb_node);
-		if (state->end >= start && (state->state & bits)) {
-			*start_ret = state->start;
-			*end_ret = state->end;
-			ret = 0;
-			break;
-		}
-		node = rb_next(node);
-		if (!node)
-			break;
-	}
-out:
-	spin_unlock(&tree->lock);
-	return ret;
-}
-
 /* find the first state struct with 'bits' set after 'start', and
  * return it. tree->lock must be held. NULL will returned if
  * nothing was found after 'start'
@@ -1131,6 +1063,30 @@ out:
 }
 
 /*
+ * find the first offset in the io tree with 'bits' set. zero is
+ * returned if we find something, and *start_ret and *end_ret are
+ * set to reflect the state struct that was found.
+ *
+ * If nothing was found, 1 is returned, < 0 on error
+ */
+int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+			  u64 *start_ret, u64 *end_ret, int bits)
+{
+	struct extent_state *state;
+	int ret = 1;
+
+	spin_lock(&tree->lock);
+	state = find_first_extent_bit_state(tree, start, bits);
+	if (state) {
+		*start_ret = state->start;
+		*end_ret = state->end;
+		ret = 0;
+	}
+	spin_unlock(&tree->lock);
+	return ret;
+}
+
+/*
  * find a contiguous range of bytes in the file marked as delalloc, not
  * more than 'max_bytes'.  start and end are used to return the range,
  *
@@ -2546,7 +2502,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 			  struct writeback_control *wbc)
 {
 	int ret;
-	struct address_space *mapping = page->mapping;
 	struct extent_page_data epd = {
 		.bio = NULL,
 		.tree = tree,
@@ -2554,17 +2509,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
-	struct writeback_control wbc_writepages = {
-		.sync_mode	= wbc->sync_mode,
-		.nr_to_write	= 64,
-		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
-		.range_end	= (loff_t)-1,
-	};
 
 	ret = __extent_writepage(page, wbc, &epd);
 
-	extent_write_cache_pages(tree, mapping, &wbc_writepages,
-				 __extent_writepage, &epd, flush_write_bio);
 	flush_epd_write_bio(&epd);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 21a7ca9e7282..7b2f0c3e7929 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -76,15 +76,15 @@ struct extent_io_ops {
 			      struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				     struct extent_state *state, int uptodate);
-	int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
-			    int *bits);
-	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-			      int *bits);
-	int (*merge_extent_hook)(struct inode *inode,
-				 struct extent_state *new,
-				 struct extent_state *other);
-	int (*split_extent_hook)(struct inode *inode,
-				 struct extent_state *orig, u64 split);
+	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+			     int *bits);
+	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
+			       int *bits);
+	void (*merge_extent_hook)(struct inode *inode,
+				  struct extent_state *new,
+				  struct extent_state *other);
+	void (*split_extent_hook)(struct inode *inode,
+				  struct extent_state *orig, u64 split);
 	int (*write_cache_pages_lock_hook)(struct page *page);
 };
 
@@ -108,8 +108,6 @@ struct extent_state {
 	wait_queue_head_t wq;
 	atomic_t refs;
 	unsigned long state;
-	u64 split_start;
-	u64 split_end;
 
 	/* for use by the FS */
 	u64 private;
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2d0410344ea3..7c97b3301459 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	return 0;
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 {
-	int ret = 0;
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
-	struct extent_map *em;
-
-	write_lock(&tree->lock);
-	em = lookup_extent_mapping(tree, start, len);
-
-	WARN_ON(!em || em->start != start);
-
-	if (!em)
-		goto out;
-
-	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 
 	if (em->start != 0) {
 		rb = rb_prev(&em->rb_node);
@@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
 		merge->in_tree = 0;
 		free_extent_map(merge);
 	}
+}
+
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+{
+	int ret = 0;
+	struct extent_map *em;
+
+	write_lock(&tree->lock);
+	em = lookup_extent_mapping(tree, start, len);
+
+	WARN_ON(!em || em->start != start);
+
+	if (!em)
+		goto out;
+
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	try_merge_map(tree, em);
 
 	free_extent_map(em);
 out:
@@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
 			struct extent_map *em)
 {
 	int ret = 0;
-	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 	struct extent_map *exist;
 
@@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
 		goto out;
 	}
 	atomic_inc(&em->refs);
-	if (em->start != 0) {
-		rb = rb_prev(&em->rb_node);
-		if (rb)
-			merge = rb_entry(rb, struct extent_map, rb_node);
-		if (rb && mergable_maps(merge, em)) {
-			em->start = merge->start;
-			em->len += merge->len;
-			em->block_len += merge->block_len;
-			em->block_start = merge->block_start;
-			merge->in_tree = 0;
-			rb_erase(&merge->rb_node, &tree->map);
-			free_extent_map(merge);
-		}
-	}
-	rb = rb_next(&em->rb_node);
-	if (rb)
-		merge = rb_entry(rb, struct extent_map, rb_node);
-	if (rb && mergable_maps(em, merge)) {
-		em->len += merge->len;
-		em->block_len += merge->len;
-		rb_erase(&merge->rb_node, &tree->map);
-		merge->in_tree = 0;
-		free_extent_map(merge);
-	}
+
+	try_merge_map(tree, em);
 out:
 	return ret;
 }
@@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len)
 	return start + len;
 }
 
-/**
- * lookup_extent_mapping - lookup extent_map
- * @tree:	tree to lookup in
- * @start:	byte offset to start the search
- * @len:	length of the lookup range
- *
- * Find and return the first extent_map struct in @tree that intersects the
- * [start, len] range.  There may be additional objects in the tree that
- * intersect, so check the object returned carefully to make sure that no
- * additional lookups are needed.
- */
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
-					 u64 start, u64 len)
+struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
+					   u64 start, u64 len, int strict)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
@@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 	u64 end = range_end(start, len);
 
 	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		if (end > em->start && start < extent_map_end(em))
-			goto found;
-	}
 	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
+		if (prev)
+			rb_node = prev;
+		else if (next)
+			rb_node = next;
+		else
+			return NULL;
 	}
+
 	em = rb_entry(rb_node, struct extent_map, rb_node);
-	if (end > em->start && start < extent_map_end(em))
-		goto found;
 
-	em = NULL;
-	goto out;
+	if (strict && !(end > em->start && start < extent_map_end(em)))
+		return NULL;
 
-found:
 	atomic_inc(&em->refs);
-out:
 	return em;
 }
 
 /**
+ * lookup_extent_mapping - lookup extent_map
+ * @tree:	tree to lookup in
+ * @start:	byte offset to start the search
+ * @len:	length of the lookup range
+ *
+ * Find and return the first extent_map struct in @tree that intersects the
+ * [start, len] range.  There may be additional objects in the tree that
+ * intersect, so check the object returned carefully to make sure that no
+ * additional lookups are needed.
+ */
+struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+					 u64 start, u64 len)
+{
+	return __lookup_extent_mapping(tree, start, len, 1);
+}
+
+/**
  * search_extent_mapping - find a nearby extent map
  * @tree:	tree to lookup in
  * @start:	byte offset to start the search
@@ -365,38 +341,7 @@ out:
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len)
 {
-	struct extent_map *em;
-	struct rb_node *rb_node;
-	struct rb_node *prev = NULL;
-	struct rb_node *next = NULL;
-
-	rb_node = __tree_search(&tree->map, start, &prev, &next);
-	if (!rb_node && prev) {
-		em = rb_entry(prev, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node && next) {
-		em = rb_entry(next, struct extent_map, rb_node);
-		goto found;
-	}
-	if (!rb_node) {
-		em = NULL;
-		goto out;
-	}
-	if (IS_ERR(rb_node)) {
-		em = ERR_CAST(rb_node);
-		goto out;
-	}
-	em = rb_entry(rb_node, struct extent_map, rb_node);
-	goto found;
-
-	em = NULL;
-	goto out;
-
-found:
-	atomic_inc(&em->refs);
-out:
-	return em;
+	return __lookup_extent_mapping(tree, start, len, 0);
 }
 
 /**
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 08bcfa92a222..b910694f61ed 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -291,7 +291,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	if (search_commit) {
 		path->skip_locking = 1;
@@ -677,7 +678,9 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 		btrfs_super_csum_size(&root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	sector_sum = sums->sums;
 again:
 	next_offset = (u64)-1;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a35e51c9f235..658d66959abe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -74,7 +74,7 @@ struct inode_defrag {
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static int __btrfs_add_inode_defrag(struct inode *inode,
+static void __btrfs_add_inode_defrag(struct inode *inode,
 				    struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
 	BTRFS_I(inode)->in_defrag = 1;
 	rb_link_node(&defrag->rb_node, parent, p);
 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-	return 0;
+	return;
 
 exists:
 	kfree(defrag);
-	return 0;
+	return;
 
 }
 
@@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *defrag;
-	int ret = 0;
 	u64 transid;
 
 	if (!btrfs_test_opt(root, AUTO_DEFRAG))
@@ -150,9 +149,9 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 
 	spin_lock(&root->fs_info->defrag_inodes_lock);
 	if (!BTRFS_I(inode)->in_defrag)
-		ret = __btrfs_add_inode_defrag(inode, defrag);
+		__btrfs_add_inode_defrag(inode, defrag);
 	spin_unlock(&root->fs_info->defrag_inodes_lock);
-	return ret;
+	return 0;
 }
 
 /*
@@ -855,7 +854,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 again:
 	recow = 0;
 	split = start;
@@ -1059,7 +1059,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
 static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 			 struct page **pages, size_t num_pages,
 			 loff_t pos, unsigned long first_index,
-			 unsigned long last_index, size_t write_bytes)
+			 size_t write_bytes)
 {
 	struct extent_state *cached_state = NULL;
 	int i;
@@ -1159,7 +1159,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
 	unsigned long first_index;
-	unsigned long last_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
@@ -1172,7 +1171,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		return -ENOMEM;
 
 	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
 
 	while (iov_iter_count(i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
@@ -1206,8 +1204,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 		 * contents of pages from loop to loop
 		 */
 		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, last_index,
-				    write_bytes);
+				    pos, first_index, write_bytes);
 		if (ret) {
 			btrfs_delalloc_release_space(inode,
 					num_pages << PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 13e6255182e3..15fceefbca0a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1061,7 +1061,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	u64 ino = btrfs_ino(inode);
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	nolock = btrfs_is_free_space_inode(root, inode);
 
@@ -1282,17 +1283,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	return ret;
 }
 
-static int btrfs_split_extent_hook(struct inode *inode,
-				   struct extent_state *orig, u64 split)
+static void btrfs_split_extent_hook(struct inode *inode,
+				    struct extent_state *orig, u64 split)
 {
 	/* not delalloc, ignore it */
 	if (!(orig->state & EXTENT_DELALLOC))
-		return 0;
+		return;
 
 	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->outstanding_extents++;
 	spin_unlock(&BTRFS_I(inode)->lock);
-	return 0;
 }
 
 /*
@@ -1301,18 +1301,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
-static int btrfs_merge_extent_hook(struct inode *inode,
-				   struct extent_state *new,
-				   struct extent_state *other)
+static void btrfs_merge_extent_hook(struct inode *inode,
+				    struct extent_state *new,
+				    struct extent_state *other)
 {
 	/* not delalloc, ignore it */
 	if (!(other->state & EXTENT_DELALLOC))
-		return 0;
+		return;
 
 	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->outstanding_extents--;
 	spin_unlock(&BTRFS_I(inode)->lock);
-	return 0;
 }
 
 /*
@@ -1320,8 +1319,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
-static int btrfs_set_bit_hook(struct inode *inode,
-			      struct extent_state *state, int *bits)
+static void btrfs_set_bit_hook(struct inode *inode,
+			       struct extent_state *state, int *bits)
 {
 
 	/*
@@ -1351,14 +1350,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
 		}
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
-	return 0;
 }
 
 /*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
-static int btrfs_clear_bit_hook(struct inode *inode,
-				struct extent_state *state, int *bits)
+static void btrfs_clear_bit_hook(struct inode *inode,
+				 struct extent_state *state, int *bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
@@ -1395,7 +1393,6 @@ static int btrfs_clear_bit_hook(struct inode *inode,
 		}
 		spin_unlock(&root->fs_info->delalloc_lock);
 	}
-	return 0;
 }
 
 /*
@@ -1645,7 +1642,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	int ret;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
 
 	path->leave_spinning = 1;
 
@@ -2215,7 +2213,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2215 2213
2216 if (!root->orphan_block_rsv) { 2214 if (!root->orphan_block_rsv) {
2217 block_rsv = btrfs_alloc_block_rsv(root); 2215 block_rsv = btrfs_alloc_block_rsv(root);
2218 BUG_ON(!block_rsv); 2216 if (!block_rsv)
2217 return -ENOMEM;
2219 } 2218 }
2220 2219
2221 spin_lock(&root->orphan_lock); 2220 spin_lock(&root->orphan_lock);
@@ -2517,7 +2516,9 @@ static void btrfs_read_locked_inode(struct inode *inode)
2517 filled = true; 2516 filled = true;
2518 2517
2519 path = btrfs_alloc_path(); 2518 path = btrfs_alloc_path();
2520 BUG_ON(!path); 2519 if (!path)
2520 goto make_bad;
2521
2521 path->leave_spinning = 1; 2522 path->leave_spinning = 1;
2522 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2523 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2523 2524
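btrfs_read_locked_inode() returns void, so a failed btrfs_alloc_path() cannot hand an errno back; the new goto instead marks the inode bad and lets btrfs_iget(), changed further down, detect that via is_bad_inode(). A sketch of the label this jumps to, assuming it mirrors the function's existing failure path (btrfs_free_path(NULL) is a no-op, so the jump is safe even though path was never allocated):

    make_bad:
            btrfs_free_path(path);
            make_bad_inode(inode);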
@@ -2998,13 +2999,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2998 2999
2999 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3000 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3000 dentry->d_name.name, dentry->d_name.len); 3001 dentry->d_name.name, dentry->d_name.len);
3001 BUG_ON(ret); 3002 if (ret)
3003 goto out;
3002 3004
3003 if (inode->i_nlink == 0) { 3005 if (inode->i_nlink == 0) {
3004 ret = btrfs_orphan_add(trans, inode); 3006 ret = btrfs_orphan_add(trans, inode);
3005 BUG_ON(ret); 3007 if (ret)
3008 goto out;
3006 } 3009 }
3007 3010
3011out:
3008 nr = trans->blocks_used; 3012 nr = trans->blocks_used;
3009 __unlink_end_trans(trans, root); 3013 __unlink_end_trans(trans, root);
3010 btrfs_btree_balance_dirty(root, nr); 3014 btrfs_btree_balance_dirty(root, nr);
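Instead of BUG()ing when the unlink or the orphan insertion fails, both error paths now funnel through a single out: label, so the transaction is still ended and dirty btree pages are still balanced before the error is returned. The resulting control flow, in outline:

    ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
                             dentry->d_name.name, dentry->d_name.len);
    if (ret)
            goto out;
    if (inode->i_nlink == 0) {
            ret = btrfs_orphan_add(trans, inode);
            if (ret)
                    goto out;
    }
    out:
            nr = trans->blocks_used;
            __unlink_end_trans(trans, root);
            btrfs_btree_balance_dirty(root, nr);
            return ret;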
@@ -3147,6 +3151,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3147 3151
3148 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3152 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3149 3153
3154 path = btrfs_alloc_path();
3155 if (!path)
3156 return -ENOMEM;
3157 path->reada = -1;
3158
3150 if (root->ref_cows || root == root->fs_info->tree_root) 3159 if (root->ref_cows || root == root->fs_info->tree_root)
3151 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3160 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3152 3161
@@ -3159,10 +3168,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3159 if (min_type == 0 && root == BTRFS_I(inode)->root) 3168 if (min_type == 0 && root == BTRFS_I(inode)->root)
3160 btrfs_kill_delayed_inode_items(inode); 3169 btrfs_kill_delayed_inode_items(inode);
3161 3170
3162 path = btrfs_alloc_path();
3163 BUG_ON(!path);
3164 path->reada = -1;
3165
3166 key.objectid = ino; 3171 key.objectid = ino;
3167 key.offset = (u64)-1; 3172 key.offset = (u64)-1;
3168 key.type = (u8)-1; 3173 key.type = (u8)-1;
@@ -3690,7 +3695,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3690 int ret = 0; 3695 int ret = 0;
3691 3696
3692 path = btrfs_alloc_path(); 3697 path = btrfs_alloc_path();
3693 BUG_ON(!path); 3698 if (!path)
3699 return -ENOMEM;
3694 3700
3695 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3701 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3696 namelen, 0); 3702 namelen, 0);
@@ -3946,6 +3952,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3946 struct btrfs_root *root, int *new) 3952 struct btrfs_root *root, int *new)
3947{ 3953{
3948 struct inode *inode; 3954 struct inode *inode;
3955 int bad_inode = 0;
3949 3956
3950 inode = btrfs_iget_locked(s, location->objectid, root); 3957 inode = btrfs_iget_locked(s, location->objectid, root);
3951 if (!inode) 3958 if (!inode)
@@ -3955,10 +3962,19 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3955 BTRFS_I(inode)->root = root; 3962 BTRFS_I(inode)->root = root;
3956 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3963 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3957 btrfs_read_locked_inode(inode); 3964 btrfs_read_locked_inode(inode);
3958 inode_tree_add(inode); 3965 if (!is_bad_inode(inode)) {
3959 unlock_new_inode(inode); 3966 inode_tree_add(inode);
3960 if (new) 3967 unlock_new_inode(inode);
3961 *new = 1; 3968 if (new)
3969 *new = 1;
3970 } else {
3971 bad_inode = 1;
3972 }
3973 }
3974
3975 if (bad_inode) {
3976 iput(inode);
3977 inode = ERR_PTR(-ESTALE);
3962 } 3978 }
3963 3979
3964 return inode; 3980 return inode;
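With btrfs_read_locked_inode() unable to report errors directly, btrfs_iget() now inspects the result itself: a bad inode is dropped rather than handed to the VFS half-read, and the lookup fails with -ESTALE. Stripped of the bad_inode bookkeeping, the failure path reduces to:

    if (is_bad_inode(inode)) {
            iput(inode);
            return ERR_PTR(-ESTALE);
    }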
@@ -3993,12 +4009,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3993 struct btrfs_root *sub_root = root; 4009 struct btrfs_root *sub_root = root;
3994 struct btrfs_key location; 4010 struct btrfs_key location;
3995 int index; 4011 int index;
3996 int ret; 4012 int ret = 0;
3997 4013
3998 if (dentry->d_name.len > BTRFS_NAME_LEN) 4014 if (dentry->d_name.len > BTRFS_NAME_LEN)
3999 return ERR_PTR(-ENAMETOOLONG); 4015 return ERR_PTR(-ENAMETOOLONG);
4000 4016
4001 ret = btrfs_inode_by_name(dir, dentry, &location); 4017 if (unlikely(d_need_lookup(dentry))) {
4018 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4019 kfree(dentry->d_fsdata);
4020 dentry->d_fsdata = NULL;
4021 d_clear_need_lookup(dentry);
4022 } else {
4023 ret = btrfs_inode_by_name(dir, dentry, &location);
4024 }
4002 4025
4003 if (ret < 0) 4026 if (ret < 0)
4004 return ERR_PTR(ret); 4027 return ERR_PTR(ret);
@@ -4053,6 +4076,12 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
4053 return 0; 4076 return 0;
4054} 4077}
4055 4078
4079static void btrfs_dentry_release(struct dentry *dentry)
4080{
4081 if (dentry->d_fsdata)
4082 kfree(dentry->d_fsdata);
4083}
4084
4056static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4085static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4057 struct nameidata *nd) 4086 struct nameidata *nd)
4058{ 4087{
@@ -4075,6 +4104,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4075 struct btrfs_path *path; 4104 struct btrfs_path *path;
4076 struct list_head ins_list; 4105 struct list_head ins_list;
4077 struct list_head del_list; 4106 struct list_head del_list;
4107 struct qstr q;
4078 int ret; 4108 int ret;
4079 struct extent_buffer *leaf; 4109 struct extent_buffer *leaf;
4080 int slot; 4110 int slot;
@@ -4164,6 +4194,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4164 4194
4165 while (di_cur < di_total) { 4195 while (di_cur < di_total) {
4166 struct btrfs_key location; 4196 struct btrfs_key location;
4197 struct dentry *tmp;
4167 4198
4168 if (verify_dir_item(root, leaf, di)) 4199 if (verify_dir_item(root, leaf, di))
4169 break; 4200 break;
@@ -4184,6 +4215,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4184 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4215 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
4185 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4216 btrfs_dir_item_key_to_cpu(leaf, di, &location);
4186 4217
4218 q.name = name_ptr;
4219 q.len = name_len;
4220 q.hash = full_name_hash(q.name, q.len);
4221 tmp = d_lookup(filp->f_dentry, &q);
4222 if (!tmp) {
4223 struct btrfs_key *newkey;
4224
4225 newkey = kzalloc(sizeof(struct btrfs_key),
4226 GFP_NOFS);
4227 if (!newkey)
4228 goto no_dentry;
4229 tmp = d_alloc(filp->f_dentry, &q);
4230 if (!tmp) {
4231 kfree(newkey);
4232 dput(tmp);
4233 goto no_dentry;
4234 }
4235 memcpy(newkey, &location,
4236 sizeof(struct btrfs_key));
4237 tmp->d_fsdata = newkey;
4238 tmp->d_flags |= DCACHE_NEED_LOOKUP;
4239 d_rehash(tmp);
4240 dput(tmp);
4241 } else {
4242 dput(tmp);
4243 }
4244no_dentry:
4187 /* is this a reference to our own snapshot? If so 4245 /* is this a reference to our own snapshot? If so
4188 * skip it 4246 * skip it
4189 */ 4247 */
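This block is the producer half of the DCACHE_NEED_LOOKUP scheme: readdir already holds each entry's btrfs_key, so it allocates a hashed but not-yet-looked-up dentry, parks the key in d_fsdata and flags the dentry DCACHE_NEED_LOOKUP. The d_need_lookup() branch added to btrfs_lookup_dentry() above is the consumer, turning the first real lookup into a memcpy instead of a tree search. (One small wart: on the d_alloc() failure path the dput(tmp) is a no-op, since tmp is NULL there.) The consumer side, repeated here for contrast:

    if (unlikely(d_need_lookup(dentry))) {
            /* key was cached by readdir; skip btrfs_inode_by_name() */
            memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
            kfree(dentry->d_fsdata);
            dentry->d_fsdata = NULL;
            d_clear_need_lookup(dentry);
    }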
@@ -4409,7 +4467,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4409 int owner; 4467 int owner;
4410 4468
4411 path = btrfs_alloc_path(); 4469 path = btrfs_alloc_path();
4412 BUG_ON(!path); 4470 if (!path)
4471 return ERR_PTR(-ENOMEM);
4413 4472
4414 inode = new_inode(root->fs_info->sb); 4473 inode = new_inode(root->fs_info->sb);
4415 if (!inode) { 4474 if (!inode) {
@@ -6669,19 +6728,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6669 return 0; 6728 return 0;
6670} 6729}
6671 6730
6672/* helper function for file defrag and space balancing. This
6673 * forces readahead on a given range of bytes in an inode
6674 */
6675unsigned long btrfs_force_ra(struct address_space *mapping,
6676 struct file_ra_state *ra, struct file *file,
6677 pgoff_t offset, pgoff_t last_index)
6678{
6679 pgoff_t req_size = last_index - offset + 1;
6680
6681 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
6682 return offset + req_size;
6683}
6684
6685struct inode *btrfs_alloc_inode(struct super_block *sb) 6731struct inode *btrfs_alloc_inode(struct super_block *sb)
6686{ 6732{
6687 struct btrfs_inode *ei; 6733 struct btrfs_inode *ei;
@@ -7164,7 +7210,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7164 goto out_unlock; 7210 goto out_unlock;
7165 7211
7166 path = btrfs_alloc_path(); 7212 path = btrfs_alloc_path();
7167 BUG_ON(!path); 7213 if (!path) {
7214 err = -ENOMEM;
7215 drop_inode = 1;
7216 goto out_unlock;
7217 }
7168 key.objectid = btrfs_ino(inode); 7218 key.objectid = btrfs_ino(inode);
7169 key.offset = 0; 7219 key.offset = 0;
7170 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7220 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
@@ -7430,4 +7480,5 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7430 7480
7431const struct dentry_operations btrfs_dentry_operations = { 7481const struct dentry_operations btrfs_dentry_operations = {
7432 .d_delete = btrfs_dentry_delete, 7482 .d_delete = btrfs_dentry_delete,
7483 .d_release = btrfs_dentry_release,
7433}; 7484};
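btrfs_dentry_release() is the matching cleanup hook: any key still parked in d_fsdata when a dentry dies gets freed. Since kfree(NULL) is a no-op, the NULL check in the version above is redundant; an equivalent, slightly tighter form would be:

    static void btrfs_dentry_release(struct dentry *dentry)
    {
            kfree(dentry->d_fsdata);        /* kfree(NULL) is harmless */
    }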
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0b980afc5edd..7cf013349941 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1749,11 +1749,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1749 key.objectid = key.offset; 1749 key.objectid = key.offset;
1750 key.offset = (u64)-1; 1750 key.offset = (u64)-1;
1751 dirid = key.objectid; 1751 dirid = key.objectid;
1752
1753 } 1752 }
1754 if (ptr < name) 1753 if (ptr < name)
1755 goto out; 1754 goto out;
1756 memcpy(name, ptr, total_len); 1755 memmove(name, ptr, total_len);
1757 name[total_len]='\0'; 1756 name[total_len]='\0';
1758 ret = 0; 1757 ret = 0;
1759out: 1758out:
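The memcpy to memmove switch matters because ptr points into the same buffer that name heads: btrfs_search_path_in_tree() assembles the path backwards from the tail of the caller's buffer and then slides it to the front. Overlapping source and destination is undefined behavior for memcpy; memmove is specified to handle it. In miniature:

    char buf[8] = "..abcd";         /* path built at the tail */
    memmove(buf, buf + 2, 5);       /* safe: buf is now "abcd" + NUL */
    /* memcpy(buf, buf + 2, 5) would be undefined behavior here */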
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
deleted file mode 100644
index 82d569cb6267..000000000000
--- a/fs/btrfs/ref-cache.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/sort.h>
22#include "ctree.h"
23#include "ref-cache.h"
24#include "transaction.h"
25
26static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
27 struct rb_node *node)
28{
29 struct rb_node **p = &root->rb_node;
30 struct rb_node *parent = NULL;
31 struct btrfs_leaf_ref *entry;
32
33 while (*p) {
34 parent = *p;
35 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
36
37 if (bytenr < entry->bytenr)
38 p = &(*p)->rb_left;
39 else if (bytenr > entry->bytenr)
40 p = &(*p)->rb_right;
41 else
42 return parent;
43 }
44
45 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
46 rb_link_node(node, parent, p);
47 rb_insert_color(node, root);
48 return NULL;
49}
50
51static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
52{
53 struct rb_node *n = root->rb_node;
54 struct btrfs_leaf_ref *entry;
55
56 while (n) {
57 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
58 WARN_ON(!entry->in_tree);
59
60 if (bytenr < entry->bytenr)
61 n = n->rb_left;
62 else if (bytenr > entry->bytenr)
63 n = n->rb_right;
64 else
65 return n;
66 }
67 return NULL;
68}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
deleted file mode 100644
index 24f7001f6387..000000000000
--- a/fs/btrfs/ref-cache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ebe45443de06..f4099904565a 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -71,13 +71,12 @@ out:
71 return ret; 71 return ret;
72} 72}
73 73
74int btrfs_set_root_node(struct btrfs_root_item *item, 74void btrfs_set_root_node(struct btrfs_root_item *item,
75 struct extent_buffer *node) 75 struct extent_buffer *node)
76{ 76{
77 btrfs_set_root_bytenr(item, node->start); 77 btrfs_set_root_bytenr(item, node->start);
78 btrfs_set_root_level(item, btrfs_header_level(node)); 78 btrfs_set_root_level(item, btrfs_header_level(node));
79 btrfs_set_root_generation(item, btrfs_header_generation(node)); 79 btrfs_set_root_generation(item, btrfs_header_generation(node));
80 return 0;
81} 80}
82 81
83/* 82/*
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index eb55863bb4ae..7dc36fab4afc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -216,17 +216,11 @@ static void wait_current_trans(struct btrfs_root *root)
216 spin_lock(&root->fs_info->trans_lock); 216 spin_lock(&root->fs_info->trans_lock);
217 cur_trans = root->fs_info->running_transaction; 217 cur_trans = root->fs_info->running_transaction;
218 if (cur_trans && cur_trans->blocked) { 218 if (cur_trans && cur_trans->blocked) {
219 DEFINE_WAIT(wait);
220 atomic_inc(&cur_trans->use_count); 219 atomic_inc(&cur_trans->use_count);
221 spin_unlock(&root->fs_info->trans_lock); 220 spin_unlock(&root->fs_info->trans_lock);
222 while (1) { 221
223 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 222 wait_event(root->fs_info->transaction_wait,
224 TASK_UNINTERRUPTIBLE); 223 !cur_trans->blocked);
225 if (!cur_trans->blocked)
226 break;
227 schedule();
228 }
229 finish_wait(&root->fs_info->transaction_wait, &wait);
230 put_transaction(cur_trans); 224 put_transaction(cur_trans);
231 } else { 225 } else {
232 spin_unlock(&root->fs_info->trans_lock); 226 spin_unlock(&root->fs_info->trans_lock);
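All three open-coded wait loops in this file collapse into wait_event(), which expands to essentially the sequence being deleted, so nothing about the sleeping behavior changes. Roughly:

    /* wait_event(wq, cond) is approximately: */
    if (!(cond)) {
            DEFINE_WAIT(__wait);
            for (;;) {
                    prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);
                    if (cond)
                            break;
                    schedule();
            }
            finish_wait(&wq, &__wait);
    }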
@@ -357,19 +351,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
357} 351}
358 352
359/* wait for a transaction commit to be fully complete */ 353/* wait for a transaction commit to be fully complete */
360static noinline int wait_for_commit(struct btrfs_root *root, 354static noinline void wait_for_commit(struct btrfs_root *root,
361 struct btrfs_transaction *commit) 355 struct btrfs_transaction *commit)
362{ 356{
363 DEFINE_WAIT(wait); 357 wait_event(commit->commit_wait, commit->commit_done);
364 while (!commit->commit_done) {
365 prepare_to_wait(&commit->commit_wait, &wait,
366 TASK_UNINTERRUPTIBLE);
367 if (commit->commit_done)
368 break;
369 schedule();
370 }
371 finish_wait(&commit->commit_wait, &wait);
372 return 0;
373} 358}
374 359
375int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 360int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -1085,22 +1070,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1085static void wait_current_trans_commit_start(struct btrfs_root *root, 1070static void wait_current_trans_commit_start(struct btrfs_root *root,
1086 struct btrfs_transaction *trans) 1071 struct btrfs_transaction *trans)
1087{ 1072{
1088 DEFINE_WAIT(wait); 1073 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
1089
1090 if (trans->in_commit)
1091 return;
1092
1093 while (1) {
1094 prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
1095 TASK_UNINTERRUPTIBLE);
1096 if (trans->in_commit) {
1097 finish_wait(&root->fs_info->transaction_blocked_wait,
1098 &wait);
1099 break;
1100 }
1101 schedule();
1102 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1103 }
1104} 1074}
1105 1075
1106/* 1076/*
@@ -1110,24 +1080,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
1110static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, 1080static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1111 struct btrfs_transaction *trans) 1081 struct btrfs_transaction *trans)
1112{ 1082{
1113 DEFINE_WAIT(wait); 1083 wait_event(root->fs_info->transaction_wait,
1114 1084 trans->commit_done || (trans->in_commit && !trans->blocked));
1115 if (trans->commit_done || (trans->in_commit && !trans->blocked))
1116 return;
1117
1118 while (1) {
1119 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
1120 TASK_UNINTERRUPTIBLE);
1121 if (trans->commit_done ||
1122 (trans->in_commit && !trans->blocked)) {
1123 finish_wait(&root->fs_info->transaction_wait,
1124 &wait);
1125 break;
1126 }
1127 schedule();
1128 finish_wait(&root->fs_info->transaction_wait,
1129 &wait);
1130 }
1131} 1085}
1132 1086
1133/* 1087/*
@@ -1234,8 +1188,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1234 atomic_inc(&cur_trans->use_count); 1188 atomic_inc(&cur_trans->use_count);
1235 btrfs_end_transaction(trans, root); 1189 btrfs_end_transaction(trans, root);
1236 1190
1237 ret = wait_for_commit(root, cur_trans); 1191 wait_for_commit(root, cur_trans);
1238 BUG_ON(ret);
1239 1192
1240 put_transaction(cur_trans); 1193 put_transaction(cur_trans);
1241 1194
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ac278dd83175..babee65f8eda 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1617,7 +1617,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1617 return 0; 1617 return 0;
1618 1618
1619 path = btrfs_alloc_path(); 1619 path = btrfs_alloc_path();
1620 BUG_ON(!path); 1620 if (!path)
1621 return -ENOMEM;
1621 1622
1622 nritems = btrfs_header_nritems(eb); 1623 nritems = btrfs_header_nritems(eb);
1623 for (i = 0; i < nritems; i++) { 1624 for (i = 0; i < nritems; i++) {
@@ -1723,7 +1724,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1723 return -ENOMEM; 1724 return -ENOMEM;
1724 1725
1725 if (*level == 1) { 1726 if (*level == 1) {
1726 wc->process_func(root, next, wc, ptr_gen); 1727 ret = wc->process_func(root, next, wc, ptr_gen);
1728 if (ret)
1729 return ret;
1727 1730
1728 path->slots[*level]++; 1731 path->slots[*level]++;
1729 if (wc->free) { 1732 if (wc->free) {
@@ -1788,8 +1791,11 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1788 parent = path->nodes[*level + 1]; 1791 parent = path->nodes[*level + 1];
1789 1792
1790 root_owner = btrfs_header_owner(parent); 1793 root_owner = btrfs_header_owner(parent);
1791 wc->process_func(root, path->nodes[*level], wc, 1794 ret = wc->process_func(root, path->nodes[*level], wc,
1792 btrfs_header_generation(path->nodes[*level])); 1795 btrfs_header_generation(path->nodes[*level]));
1796 if (ret)
1797 return ret;
1798
1793 if (wc->free) { 1799 if (wc->free) {
1794 struct extent_buffer *next; 1800 struct extent_buffer *next;
1795 1801
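Both walk_down_log_tree() and walk_up_log_tree() previously ignored the callback's result; now that replay_one_buffer() can fail with -ENOMEM (per the hunk above), the walkers have to propagate it instead of replaying the log past a failed buffer:

    ret = wc->process_func(root, next, wc, ptr_gen);
    if (ret)
            return ret;     /* e.g. -ENOMEM from replay_one_buffer() */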
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index b89e372c7544..53875ae73ad4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1037,7 +1037,8 @@ static noinline int find_next_chunk(struct btrfs_root *root,
1037 struct btrfs_key found_key; 1037 struct btrfs_key found_key;
1038 1038
1039 path = btrfs_alloc_path(); 1039 path = btrfs_alloc_path();
1040 BUG_ON(!path); 1040 if (!path)
1041 return -ENOMEM;
1041 1042
1042 key.objectid = objectid; 1043 key.objectid = objectid;
1043 key.offset = (u64)-1; 1044 key.offset = (u64)-1;
@@ -2061,8 +2062,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
2061 2062
2062 /* step two, relocate all the chunks */ 2063 /* step two, relocate all the chunks */
2063 path = btrfs_alloc_path(); 2064 path = btrfs_alloc_path();
2064 BUG_ON(!path); 2065 if (!path) {
2065 2066 ret = -ENOMEM;
2067 goto error;
2068 }
2066 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2069 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2067 key.offset = (u64)-1; 2070 key.offset = (u64)-1;
2068 key.type = BTRFS_CHUNK_ITEM_KEY; 2071 key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -2661,7 +2664,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2661 2664
2662 ret = find_next_chunk(fs_info->chunk_root, 2665 ret = find_next_chunk(fs_info->chunk_root,
2663 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2666 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2664 BUG_ON(ret); 2667 if (ret)
2668 return ret;
2665 2669
2666 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2670 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2667 (fs_info->metadata_alloc_profile & 2671 (fs_info->metadata_alloc_profile &
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 8d8f28c94c0f..6873bb634a97 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -141,10 +141,11 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
141 141
142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP); 142 rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
143 if (rc < 0) { 143 if (rc < 0) {
144 cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", 144 cFYI(1, "%s: Failed to resolve server part of %s to IP: %d",
145 __func__, *devname, rc); 145 __func__, *devname, rc);
146 goto compose_mount_options_err; 146 goto compose_mount_options_err;
147 } 147 }
148
148 /* md_len = strlen(...) + 12 for 'sep+prefixpath=' 149 /* md_len = strlen(...) + 12 for 'sep+prefixpath='
149 * assuming that we have 'unc=' and 'ip=' in 150 * assuming that we have 'unc=' and 'ip=' in
150 * the original sb_mountdata 151 * the original sb_mountdata
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 259991bd2112..e76bfeb68267 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -87,9 +87,15 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
87 if ((cifs_pdu == NULL) || (server == NULL)) 87 if ((cifs_pdu == NULL) || (server == NULL))
88 return -EINVAL; 88 return -EINVAL;
89 89
90 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 90 if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
91 server->tcpStatus == CifsNeedNegotiate)
91 return rc; 92 return rc;
92 93
94 if (!server->session_estab) {
95 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
96 return rc;
97 }
98
93 cifs_pdu->Signature.Sequence.SequenceNumber = 99 cifs_pdu->Signature.Sequence.SequenceNumber =
94 cpu_to_le32(server->sequence_number); 100 cpu_to_le32(server->sequence_number);
95 cifs_pdu->Signature.Sequence.Reserved = 0; 101 cifs_pdu->Signature.Sequence.Reserved = 0;
@@ -178,9 +184,15 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
178 if ((cifs_pdu == NULL) || (server == NULL)) 184 if ((cifs_pdu == NULL) || (server == NULL))
179 return -EINVAL; 185 return -EINVAL;
180 186
181 if ((cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) == 0) 187 if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
188 server->tcpStatus == CifsNeedNegotiate)
182 return rc; 189 return rc;
183 190
191 if (!server->session_estab) {
192 strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
193 return rc;
194 }
195
184 cifs_pdu->Signature.Sequence.SequenceNumber = 196 cifs_pdu->Signature.Sequence.SequenceNumber =
185 cpu_to_le32(server->sequence_number); 197 cpu_to_le32(server->sequence_number);
186 cifs_pdu->Signature.Sequence.Reserved = 0; 198 cifs_pdu->Signature.Sequence.Reserved = 0;
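Two related fixes, applied identically to cifs_sign_smb() and cifs_sign_smb2(): frames sent while tcpStatus is CifsNeedNegotiate are not signed at all, and frames sent before the session is established carry the literal "BSRSPYL" placeholder in the signature slot, since no signing key exists yet at that stage of the handshake. The shared guard, as a sketch:

    if (!(cifs_pdu->Flags2 & SMBFLG2_SECURITY_SIGNATURE) ||
        server->tcpStatus == CifsNeedNegotiate)
            return rc;      /* nothing to sign yet */

    if (!server->session_estab) {
            /* pre-session frames get the well-known dummy blob */
            strncpy(cifs_pdu->Signature.SecuritySignature, "BSRSPYL", 8);
            return rc;
    }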
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 865517470967..f93eb948d071 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -86,24 +86,6 @@ extern mempool_t *cifs_sm_req_poolp;
86extern mempool_t *cifs_req_poolp; 86extern mempool_t *cifs_req_poolp;
87extern mempool_t *cifs_mid_poolp; 87extern mempool_t *cifs_mid_poolp;
88 88
89void
90cifs_sb_active(struct super_block *sb)
91{
92 struct cifs_sb_info *server = CIFS_SB(sb);
93
94 if (atomic_inc_return(&server->active) == 1)
95 atomic_inc(&sb->s_active);
96}
97
98void
99cifs_sb_deactive(struct super_block *sb)
100{
101 struct cifs_sb_info *server = CIFS_SB(sb);
102
103 if (atomic_dec_and_test(&server->active))
104 deactivate_super(sb);
105}
106
107static int 89static int
108cifs_read_super(struct super_block *sb) 90cifs_read_super(struct super_block *sb)
109{ 91{
@@ -581,6 +563,10 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
581 mutex_unlock(&dir->i_mutex); 563 mutex_unlock(&dir->i_mutex);
582 dput(dentry); 564 dput(dentry);
583 dentry = child; 565 dentry = child;
566 if (!dentry->d_inode) {
567 dput(dentry);
568 dentry = ERR_PTR(-ENOENT);
569 }
584 } while (!IS_ERR(dentry)); 570 } while (!IS_ERR(dentry));
585 _FreeXid(xid); 571 _FreeXid(xid);
586 kfree(full_path); 572 kfree(full_path);
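cifs_get_root() walks the mount prefix one component at a time, and the per-component lookup can legitimately return a negative dentry when a component no longer exists on the server; the loop previously kept walking with it. The added check converts that case into a clean -ENOENT:

    dentry = child;
    if (!dentry->d_inode) {                 /* negative: component gone */
            dput(dentry);
            dentry = ERR_PTR(-ENOENT);      /* ends the do/while */
    }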
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index fbd050c8d52a..cb71dc1f94d1 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,10 +41,6 @@ extern struct file_system_type cifs_fs_type;
41extern const struct address_space_operations cifs_addr_ops; 41extern const struct address_space_operations cifs_addr_ops;
42extern const struct address_space_operations cifs_addr_ops_smallbuf; 42extern const struct address_space_operations cifs_addr_ops_smallbuf;
43 43
44/* Functions related to super block operations */
45extern void cifs_sb_active(struct super_block *sb);
46extern void cifs_sb_deactive(struct super_block *sb);
47
48/* Functions related to inodes */ 44/* Functions related to inodes */
49extern const struct inode_operations cifs_dir_inode_ops; 45extern const struct inode_operations cifs_dir_inode_ops;
50extern struct inode *cifs_root_iget(struct super_block *); 46extern struct inode *cifs_root_iget(struct super_block *);
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 1fcf4e5b3112..38ce6d44b145 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -942,8 +942,6 @@ GLOBAL_EXTERN spinlock_t siduidlock;
942GLOBAL_EXTERN spinlock_t sidgidlock; 942GLOBAL_EXTERN spinlock_t sidgidlock;
943 943
944void cifs_oplock_break(struct work_struct *work); 944void cifs_oplock_break(struct work_struct *work);
945void cifs_oplock_break_get(struct cifsFileInfo *cfile);
946void cifs_oplock_break_put(struct cifsFileInfo *cfile);
947 945
948extern const struct slow_work_ops cifs_oplock_break_ops; 946extern const struct slow_work_ops cifs_oplock_break_ops;
949 947
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1a9fe7f816d1..aac37d99a487 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -107,7 +107,7 @@ static void mark_open_files_invalid(struct cifs_tcon *pTcon)
107static int 107static int
108cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) 108cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
109{ 109{
110 int rc = 0; 110 int rc;
111 struct cifs_ses *ses; 111 struct cifs_ses *ses;
112 struct TCP_Server_Info *server; 112 struct TCP_Server_Info *server;
113 struct nls_table *nls_codepage; 113 struct nls_table *nls_codepage;
@@ -5720,6 +5720,7 @@ CIFSSMBQAllEAs(const int xid, struct cifs_tcon *tcon,
5720 char *temp_ptr; 5720 char *temp_ptr;
5721 char *end_of_smb; 5721 char *end_of_smb;
5722 __u16 params, byte_count, data_offset; 5722 __u16 params, byte_count, data_offset;
5723 unsigned int ea_name_len = ea_name ? strlen(ea_name) : 0;
5723 5724
5724 cFYI(1, "In Query All EAs path %s", searchName); 5725 cFYI(1, "In Query All EAs path %s", searchName);
5725QAllEAsRetry: 5726QAllEAsRetry:
@@ -5837,7 +5838,8 @@ QAllEAsRetry:
5837 } 5838 }
5838 5839
5839 if (ea_name) { 5840 if (ea_name) {
5840 if (strncmp(ea_name, temp_ptr, name_len) == 0) { 5841 if (ea_name_len == name_len &&
5842 strncmp(ea_name, temp_ptr, name_len) == 0) {
5841 temp_ptr += name_len + 1; 5843 temp_ptr += name_len + 1;
5842 rc = value_len; 5844 rc = value_len;
5843 if (buf_size == 0) 5845 if (buf_size == 0)
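The old comparison was bounded only by the on-wire name length, so a query could succeed on a mere prefix: with ea_name "userxy" and a listed EA named "userx" (name_len == 5), strncmp("userxy", "userx", 5) returns 0 and the wrong EA's value comes back. Checking ea_name_len == name_len first makes the match exact. In miniature (assumes <string.h> and <assert.h>):

    const char *ea_name = "userxy";         /* caller's query */
    const char *temp_ptr = "userx";         /* EA name from the wire */
    unsigned int name_len = 5;              /* on-wire name length */

    assert(strncmp(ea_name, temp_ptr, name_len) == 0);      /* old: false hit */
    assert(!(strlen(ea_name) == name_len &&
             strncmp(ea_name, temp_ptr, name_len) == 0));   /* new: rejected */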
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e66297bad412..80c2e3add3a2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -319,25 +319,328 @@ requeue_echo:
319 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL); 319 queue_delayed_work(system_nrt_wq, &server->echo, SMB_ECHO_INTERVAL);
320} 320}
321 321
322static bool
323allocate_buffers(char **bigbuf, char **smallbuf, unsigned int size,
324 bool is_large_buf)
325{
326 char *bbuf = *bigbuf, *sbuf = *smallbuf;
327
328 if (bbuf == NULL) {
329 bbuf = (char *)cifs_buf_get();
330 if (!bbuf) {
331 cERROR(1, "No memory for large SMB response");
332 msleep(3000);
333 /* retry will check if exiting */
334 return false;
335 }
336 } else if (is_large_buf) {
337 /* we are reusing a dirty large buf, clear its start */
338 memset(bbuf, 0, size);
339 }
340
341 if (sbuf == NULL) {
342 sbuf = (char *)cifs_small_buf_get();
343 if (!sbuf) {
344 cERROR(1, "No memory for SMB response");
345 msleep(1000);
346 /* retry will check if exiting */
347 return false;
348 }
349 /* beginning of smb buffer is cleared in our buf_get */
350 } else {
351 /* if existing small buf clear beginning */
352 memset(sbuf, 0, size);
353 }
354
355 *bigbuf = bbuf;
356 *smallbuf = sbuf;
357
358 return true;
359}
360
361static int
362read_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg,
363 struct kvec *iov, unsigned int to_read,
364 unsigned int *ptotal_read, bool is_header_read)
365{
366 int length, rc = 0;
367 unsigned int total_read;
368 char *buf = iov->iov_base;
369
370 for (total_read = 0; total_read < to_read; total_read += length) {
371 length = kernel_recvmsg(server->ssocket, smb_msg, iov, 1,
372 to_read - total_read, 0);
373 if (server->tcpStatus == CifsExiting) {
374 /* then will exit */
375 rc = 2;
376 break;
377 } else if (server->tcpStatus == CifsNeedReconnect) {
378 cifs_reconnect(server);
379 /* Reconnect wakes up rspns q */
380 /* Now we will reread sock */
381 rc = 1;
382 break;
383 } else if (length == -ERESTARTSYS ||
384 length == -EAGAIN ||
385 length == -EINTR) {
386 /*
 387 * Minimum sleep to prevent looping, allowing the socket
 388 * to clear and app threads to set tcpStatus
 389 * CifsNeedReconnect if the server has hung.
390 */
391 usleep_range(1000, 2000);
392 length = 0;
393 if (!is_header_read)
394 continue;
395 /* Special handling for header read */
396 if (total_read) {
397 iov->iov_base = (to_read - total_read) +
398 buf;
399 iov->iov_len = to_read - total_read;
400 smb_msg->msg_control = NULL;
401 smb_msg->msg_controllen = 0;
402 rc = 3;
403 } else
404 rc = 1;
405 break;
406 } else if (length <= 0) {
407 cERROR(1, "Received no data, expecting %d",
408 to_read - total_read);
409 cifs_reconnect(server);
410 rc = 1;
411 break;
412 }
413 }
414
415 *ptotal_read = total_read;
416 return rc;
417}
418
419static bool
420check_rfc1002_header(struct TCP_Server_Info *server, char *buf)
421{
422 char temp = *buf;
423 unsigned int pdu_length = be32_to_cpu(
424 ((struct smb_hdr *)buf)->smb_buf_length);
425
426 /*
 427 * The first byte of the big-endian length field is actually
 428 * not part of the length but the frame type; the most common
 429 * type, zero, marks regular data.
430 */
431 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
432 return false;
433 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
434 cFYI(1, "Good RFC 1002 session rsp");
435 return false;
436 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
437 /*
438 * We get this from Windows 98 instead of an error on
439 * SMB negprot response.
440 */
 441 cFYI(1, "Negative RFC1002 Session Response Error 0x%x",
442 pdu_length);
443 /* give server a second to clean up */
444 msleep(1000);
445 /*
446 * Always try 445 first on reconnect since we get NACK
447 * on some if we ever connected to port 139 (the NACK
448 * is since we do not begin with RFC1001 session
449 * initialize frame).
450 */
451 cifs_set_port((struct sockaddr *)
452 &server->dstaddr, CIFS_PORT);
453 cifs_reconnect(server);
454 wake_up(&server->response_q);
455 return false;
456 } else if (temp != (char) 0) {
457 cERROR(1, "Unknown RFC 1002 frame");
458 cifs_dump_mem(" Received Data: ", buf, 4);
459 cifs_reconnect(server);
460 return false;
461 }
462
463 /* else we have an SMB response */
464 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
465 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
466 cERROR(1, "Invalid size SMB length %d pdu_length %d",
467 4, pdu_length+4);
468 cifs_reconnect(server);
469 wake_up(&server->response_q);
470 return false;
471 }
472
473 return true;
474}
475
476static struct mid_q_entry *
477find_cifs_mid(struct TCP_Server_Info *server, struct smb_hdr *buf,
478 int *length, bool is_large_buf, bool *is_multi_rsp, char **bigbuf)
479{
480 struct mid_q_entry *mid = NULL, *tmp_mid, *ret = NULL;
481
482 spin_lock(&GlobalMid_Lock);
483 list_for_each_entry_safe(mid, tmp_mid, &server->pending_mid_q, qhead) {
484 if (mid->mid != buf->Mid ||
485 mid->midState != MID_REQUEST_SUBMITTED ||
486 mid->command != buf->Command)
487 continue;
488
489 if (*length == 0 && check2ndT2(buf, server->maxBuf) > 0) {
490 /* We have a multipart transact2 resp */
491 *is_multi_rsp = true;
492 if (mid->resp_buf) {
493 /* merge response - fix up 1st*/
494 *length = coalesce_t2(buf, mid->resp_buf);
495 if (*length > 0) {
496 *length = 0;
497 mid->multiRsp = true;
498 break;
499 }
500 /* All parts received or packet is malformed. */
501 mid->multiEnd = true;
502 goto multi_t2_fnd;
503 }
504 if (!is_large_buf) {
505 /*FIXME: switch to already allocated largebuf?*/
506 cERROR(1, "1st trans2 resp needs bigbuf");
507 } else {
508 /* Have first buffer */
509 mid->resp_buf = buf;
510 mid->largeBuf = true;
511 *bigbuf = NULL;
512 }
513 break;
514 }
515 mid->resp_buf = buf;
516 mid->largeBuf = is_large_buf;
517multi_t2_fnd:
518 if (*length == 0)
519 mid->midState = MID_RESPONSE_RECEIVED;
520 else
521 mid->midState = MID_RESPONSE_MALFORMED;
522#ifdef CONFIG_CIFS_STATS2
523 mid->when_received = jiffies;
524#endif
525 list_del_init(&mid->qhead);
526 ret = mid;
527 break;
528 }
529 spin_unlock(&GlobalMid_Lock);
530
531 return ret;
532}
533
534static void clean_demultiplex_info(struct TCP_Server_Info *server)
535{
536 int length;
537
538 /* take it off the list, if it's not already */
539 spin_lock(&cifs_tcp_ses_lock);
540 list_del_init(&server->tcp_ses_list);
541 spin_unlock(&cifs_tcp_ses_lock);
542
543 spin_lock(&GlobalMid_Lock);
544 server->tcpStatus = CifsExiting;
545 spin_unlock(&GlobalMid_Lock);
546 wake_up_all(&server->response_q);
547
548 /*
 549 * Check if we have blocked requests that need to be freed up. Note that
550 * cifs_max_pending is normally 50, but can be set at module install
551 * time to as little as two.
552 */
553 spin_lock(&GlobalMid_Lock);
554 if (atomic_read(&server->inFlight) >= cifs_max_pending)
555 atomic_set(&server->inFlight, cifs_max_pending - 1);
556 /*
557 * We do not want to set the max_pending too low or we could end up
558 * with the counter going negative.
559 */
560 spin_unlock(&GlobalMid_Lock);
561 /*
 562 * Although there should not be any requests blocked on this queue, it
 563 * cannot hurt to be paranoid and try to wake up requests that may
 564 * have been blocked when more than 50 at a time were on the wire to the
 565 * same server - they will now see the session is in exit state and get
 566 * out of SendReceive.
567 */
568 wake_up_all(&server->request_q);
569 /* give those requests time to exit */
570 msleep(125);
571
572 if (server->ssocket) {
573 sock_release(server->ssocket);
574 server->ssocket = NULL;
575 }
576
577 if (!list_empty(&server->pending_mid_q)) {
578 struct list_head dispose_list;
579 struct mid_q_entry *mid_entry;
580 struct list_head *tmp, *tmp2;
581
582 INIT_LIST_HEAD(&dispose_list);
583 spin_lock(&GlobalMid_Lock);
584 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
585 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
586 cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
587 mid_entry->midState = MID_SHUTDOWN;
588 list_move(&mid_entry->qhead, &dispose_list);
589 }
590 spin_unlock(&GlobalMid_Lock);
591
592 /* now walk dispose list and issue callbacks */
593 list_for_each_safe(tmp, tmp2, &dispose_list) {
594 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
595 cFYI(1, "Callback mid 0x%x", mid_entry->mid);
596 list_del_init(&mid_entry->qhead);
597 mid_entry->callback(mid_entry);
598 }
599 /* 1/8th of sec is more than enough time for them to exit */
600 msleep(125);
601 }
602
603 if (!list_empty(&server->pending_mid_q)) {
604 /*
 605 * mpx threads have not exited yet; give them at least the smb
 606 * send timeout time for long ops.
607 *
608 * Due to delays on oplock break requests, we need to wait at
609 * least 45 seconds before giving up on a request getting a
610 * response and going ahead and killing cifsd.
611 */
612 cFYI(1, "Wait for exit from demultiplex thread");
613 msleep(46000);
614 /*
 615 * If threads still have not exited, they are probably never
 616 * coming home; not much else we can do but free the memory.
617 */
618 }
619
620 kfree(server->hostname);
621 kfree(server);
622
623 length = atomic_dec_return(&tcpSesAllocCount);
624 if (length > 0)
625 mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
626 GFP_KERNEL);
627}
628
322static int 629static int
323cifs_demultiplex_thread(void *p) 630cifs_demultiplex_thread(void *p)
324{ 631{
325 int length; 632 int length;
326 struct TCP_Server_Info *server = p; 633 struct TCP_Server_Info *server = p;
327 unsigned int pdu_length, total_read; 634 unsigned int pdu_length, total_read;
635 char *buf = NULL, *bigbuf = NULL, *smallbuf = NULL;
328 struct smb_hdr *smb_buffer = NULL; 636 struct smb_hdr *smb_buffer = NULL;
329 struct smb_hdr *bigbuf = NULL;
330 struct smb_hdr *smallbuf = NULL;
331 struct msghdr smb_msg; 637 struct msghdr smb_msg;
332 struct kvec iov; 638 struct kvec iov;
333 struct socket *csocket = server->ssocket;
334 struct list_head *tmp, *tmp2;
335 struct task_struct *task_to_wake = NULL; 639 struct task_struct *task_to_wake = NULL;
336 struct mid_q_entry *mid_entry; 640 struct mid_q_entry *mid_entry;
337 char temp;
338 bool isLargeBuf = false; 641 bool isLargeBuf = false;
339 bool isMultiRsp; 642 bool isMultiRsp = false;
340 int reconnect; 643 int rc;
341 644
342 current->flags |= PF_MEMALLOC; 645 current->flags |= PF_MEMALLOC;
343 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current)); 646 cFYI(1, "Demultiplex PID: %d", task_pid_nr(current));
@@ -351,35 +654,16 @@ cifs_demultiplex_thread(void *p)
351 while (server->tcpStatus != CifsExiting) { 654 while (server->tcpStatus != CifsExiting) {
352 if (try_to_freeze()) 655 if (try_to_freeze())
353 continue; 656 continue;
354 if (bigbuf == NULL) {
355 bigbuf = cifs_buf_get();
356 if (!bigbuf) {
357 cERROR(1, "No memory for large SMB response");
358 msleep(3000);
359 /* retry will check if exiting */
360 continue;
361 }
362 } else if (isLargeBuf) {
363 /* we are reusing a dirty large buf, clear its start */
364 memset(bigbuf, 0, sizeof(struct smb_hdr));
365 }
366 657
367 if (smallbuf == NULL) { 658 if (!allocate_buffers(&bigbuf, &smallbuf,
368 smallbuf = cifs_small_buf_get(); 659 sizeof(struct smb_hdr), isLargeBuf))
369 if (!smallbuf) { 660 continue;
370 cERROR(1, "No memory for SMB response");
371 msleep(1000);
372 /* retry will check if exiting */
373 continue;
374 }
375 /* beginning of smb buffer is cleared in our buf_get */
376 } else /* if existing small buf clear beginning */
377 memset(smallbuf, 0, sizeof(struct smb_hdr));
378 661
379 isLargeBuf = false; 662 isLargeBuf = false;
380 isMultiRsp = false; 663 isMultiRsp = false;
381 smb_buffer = smallbuf; 664 smb_buffer = (struct smb_hdr *)smallbuf;
382 iov.iov_base = smb_buffer; 665 buf = smallbuf;
666 iov.iov_base = buf;
383 iov.iov_len = 4; 667 iov.iov_len = 4;
384 smb_msg.msg_control = NULL; 668 smb_msg.msg_control = NULL;
385 smb_msg.msg_controllen = 0; 669 smb_msg.msg_controllen = 0;
@@ -393,158 +677,50 @@ incomplete_rcv:
393 "Reconnecting...", server->hostname, 677 "Reconnecting...", server->hostname,
394 (echo_retries * SMB_ECHO_INTERVAL / HZ)); 678 (echo_retries * SMB_ECHO_INTERVAL / HZ));
395 cifs_reconnect(server); 679 cifs_reconnect(server);
396 csocket = server->ssocket;
397 wake_up(&server->response_q); 680 wake_up(&server->response_q);
398 continue; 681 continue;
399 } 682 }
400 683
401 length = 684 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
402 kernel_recvmsg(csocket, &smb_msg, 685 &total_read, true /* header read */);
403 &iov, 1, pdu_length, 0 /* BB other flags? */); 686 if (rc == 3)
404 687 goto incomplete_rcv;
405 if (server->tcpStatus == CifsExiting) { 688 else if (rc == 2)
406 break; 689 break;
407 } else if (server->tcpStatus == CifsNeedReconnect) { 690 else if (rc == 1)
408 cFYI(1, "Reconnect after server stopped responding");
409 cifs_reconnect(server);
410 cFYI(1, "call to reconnect done");
411 csocket = server->ssocket;
412 continue;
413 } else if (length == -ERESTARTSYS ||
414 length == -EAGAIN ||
415 length == -EINTR) {
416 msleep(1); /* minimum sleep to prevent looping
417 allowing socket to clear and app threads to set
418 tcpStatus CifsNeedReconnect if server hung */
419 if (pdu_length < 4) {
420 iov.iov_base = (4 - pdu_length) +
421 (char *)smb_buffer;
422 iov.iov_len = pdu_length;
423 smb_msg.msg_control = NULL;
424 smb_msg.msg_controllen = 0;
425 goto incomplete_rcv;
426 } else
427 continue;
428 } else if (length <= 0) {
429 cFYI(1, "Reconnect after unexpected peek error %d",
430 length);
431 cifs_reconnect(server);
432 csocket = server->ssocket;
433 wake_up(&server->response_q);
434 continue; 691 continue;
435 } else if (length < pdu_length) {
436 cFYI(1, "requested %d bytes but only got %d bytes",
437 pdu_length, length);
438 pdu_length -= length;
439 msleep(1);
440 goto incomplete_rcv;
441 }
442
443 /* The right amount was read from socket - 4 bytes */
444 /* so we can now interpret the length field */
445 692
446 /* the first byte big endian of the length field, 693 /*
447 is actually not part of the length but the type 694 * The right amount was read from socket - 4 bytes,
448 with the most common, zero, as regular data */ 695 * so we can now interpret the length field.
449 temp = *((char *) smb_buffer); 696 */
450 697
451 /* Note that FC 1001 length is big endian on the wire, 698 /*
452 but we convert it here so it is always manipulated 699 * Note that RFC 1001 length is big endian on the wire,
453 as host byte order */ 700 * but we convert it here so it is always manipulated
701 * as host byte order.
702 */
454 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); 703 pdu_length = be32_to_cpu(smb_buffer->smb_buf_length);
455 704
456 cFYI(1, "rfc1002 length 0x%x", pdu_length+4); 705 cFYI(1, "rfc1002 length 0x%x", pdu_length+4);
457 706 if (!check_rfc1002_header(server, buf))
458 if (temp == (char) RFC1002_SESSION_KEEP_ALIVE) {
459 continue;
460 } else if (temp == (char)RFC1002_POSITIVE_SESSION_RESPONSE) {
461 cFYI(1, "Good RFC 1002 session rsp");
462 continue;
463 } else if (temp == (char)RFC1002_NEGATIVE_SESSION_RESPONSE) {
464 /* we get this from Windows 98 instead of
465 an error on SMB negprot response */
466 cFYI(1, "Negative RFC1002 Session Response Error 0x%x)",
467 pdu_length);
468 /* give server a second to clean up */
469 msleep(1000);
470 /* always try 445 first on reconnect since we get NACK
471 * on some if we ever connected to port 139 (the NACK
472 * is since we do not begin with RFC1001 session
473 * initialize frame)
474 */
475 cifs_set_port((struct sockaddr *)
476 &server->dstaddr, CIFS_PORT);
477 cifs_reconnect(server);
478 csocket = server->ssocket;
479 wake_up(&server->response_q);
480 continue;
481 } else if (temp != (char) 0) {
482 cERROR(1, "Unknown RFC 1002 frame");
483 cifs_dump_mem(" Received Data: ", (char *)smb_buffer,
484 length);
485 cifs_reconnect(server);
486 csocket = server->ssocket;
487 continue; 707 continue;
488 }
489
490 /* else we have an SMB response */
491 if ((pdu_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) ||
492 (pdu_length < sizeof(struct smb_hdr) - 1 - 4)) {
493 cERROR(1, "Invalid size SMB length %d pdu_length %d",
494 length, pdu_length+4);
495 cifs_reconnect(server);
496 csocket = server->ssocket;
497 wake_up(&server->response_q);
498 continue;
499 }
500 708
501 /* else length ok */ 709 /* else length ok */
502 reconnect = 0;
503
504 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) { 710 if (pdu_length > MAX_CIFS_SMALL_BUFFER_SIZE - 4) {
505 isLargeBuf = true; 711 isLargeBuf = true;
506 memcpy(bigbuf, smallbuf, 4); 712 memcpy(bigbuf, smallbuf, 4);
507 smb_buffer = bigbuf; 713 smb_buffer = (struct smb_hdr *)bigbuf;
714 buf = bigbuf;
508 } 715 }
509 length = 0; 716
510 iov.iov_base = 4 + (char *)smb_buffer; 717 iov.iov_base = 4 + buf;
511 iov.iov_len = pdu_length; 718 iov.iov_len = pdu_length;
512 for (total_read = 0; total_read < pdu_length; 719 rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
513 total_read += length) { 720 &total_read, false);
514 length = kernel_recvmsg(csocket, &smb_msg, &iov, 1, 721 if (rc == 2)
515 pdu_length - total_read, 0);
516 if (server->tcpStatus == CifsExiting) {
517 /* then will exit */
518 reconnect = 2;
519 break;
520 } else if (server->tcpStatus == CifsNeedReconnect) {
521 cifs_reconnect(server);
522 csocket = server->ssocket;
523 /* Reconnect wakes up rspns q */
524 /* Now we will reread sock */
525 reconnect = 1;
526 break;
527 } else if (length == -ERESTARTSYS ||
528 length == -EAGAIN ||
529 length == -EINTR) {
530 msleep(1); /* minimum sleep to prevent looping,
531 allowing socket to clear and app
532 threads to set tcpStatus
533 CifsNeedReconnect if server hung*/
534 length = 0;
535 continue;
536 } else if (length <= 0) {
537 cERROR(1, "Received no data, expecting %d",
538 pdu_length - total_read);
539 cifs_reconnect(server);
540 csocket = server->ssocket;
541 reconnect = 1;
542 break;
543 }
544 }
545 if (reconnect == 2)
546 break; 722 break;
547 else if (reconnect == 1) 723 else if (rc == 1)
548 continue; 724 continue;
549 725
550 total_read += 4; /* account for rfc1002 hdr */ 726 total_read += 4; /* account for rfc1002 hdr */
@@ -562,75 +738,13 @@ incomplete_rcv:
562 */ 738 */
563 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read); 739 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
564 if (length != 0) 740 if (length != 0)
565 cifs_dump_mem("Bad SMB: ", smb_buffer, 741 cifs_dump_mem("Bad SMB: ", buf,
566 min_t(unsigned int, total_read, 48)); 742 min_t(unsigned int, total_read, 48));
567 743
568 mid_entry = NULL;
569 server->lstrp = jiffies; 744 server->lstrp = jiffies;
570 745
571 spin_lock(&GlobalMid_Lock); 746 mid_entry = find_cifs_mid(server, smb_buffer, &length,
572 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { 747 isLargeBuf, &isMultiRsp, &bigbuf);
573 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
574
575 if (mid_entry->mid != smb_buffer->Mid ||
576 mid_entry->midState != MID_REQUEST_SUBMITTED ||
577 mid_entry->command != smb_buffer->Command) {
578 mid_entry = NULL;
579 continue;
580 }
581
582 if (length == 0 &&
583 check2ndT2(smb_buffer, server->maxBuf) > 0) {
584 /* We have a multipart transact2 resp */
585 isMultiRsp = true;
586 if (mid_entry->resp_buf) {
587 /* merge response - fix up 1st*/
588 length = coalesce_t2(smb_buffer,
589 mid_entry->resp_buf);
590 if (length > 0) {
591 length = 0;
592 mid_entry->multiRsp = true;
593 break;
594 } else {
595 /* all parts received or
596 * packet is malformed
597 */
598 mid_entry->multiEnd = true;
599 goto multi_t2_fnd;
600 }
601 } else {
602 if (!isLargeBuf) {
603 /*
604 * FIXME: switch to already
605 * allocated largebuf?
606 */
607 cERROR(1, "1st trans2 resp "
608 "needs bigbuf");
609 } else {
610 /* Have first buffer */
611 mid_entry->resp_buf =
612 smb_buffer;
613 mid_entry->largeBuf = true;
614 bigbuf = NULL;
615 }
616 }
617 break;
618 }
619 mid_entry->resp_buf = smb_buffer;
620 mid_entry->largeBuf = isLargeBuf;
621multi_t2_fnd:
622 if (length == 0)
623 mid_entry->midState = MID_RESPONSE_RECEIVED;
624 else
625 mid_entry->midState = MID_RESPONSE_MALFORMED;
626#ifdef CONFIG_CIFS_STATS2
627 mid_entry->when_received = jiffies;
628#endif
629 list_del_init(&mid_entry->qhead);
630 break;
631 }
632 spin_unlock(&GlobalMid_Lock);
633
634 if (mid_entry != NULL) { 748 if (mid_entry != NULL) {
635 mid_entry->callback(mid_entry); 749 mid_entry->callback(mid_entry);
636 /* Was previous buf put in mpx struct for multi-rsp? */ 750 /* Was previous buf put in mpx struct for multi-rsp? */
@@ -648,7 +762,7 @@ multi_t2_fnd:
648 !isMultiRsp) { 762 !isMultiRsp) {
649 cERROR(1, "No task to wake, unknown frame received! " 763 cERROR(1, "No task to wake, unknown frame received! "
650 "NumMids %d", atomic_read(&midCount)); 764 "NumMids %d", atomic_read(&midCount));
651 cifs_dump_mem("Received Data is: ", (char *)smb_buffer, 765 cifs_dump_mem("Received Data is: ", buf,
652 sizeof(struct smb_hdr)); 766 sizeof(struct smb_hdr));
653#ifdef CONFIG_CIFS_DEBUG2 767#ifdef CONFIG_CIFS_DEBUG2
654 cifs_dump_detail(smb_buffer); 768 cifs_dump_detail(smb_buffer);
@@ -658,88 +772,13 @@ multi_t2_fnd:
658 } 772 }
659 } /* end while !EXITING */ 773 } /* end while !EXITING */
660 774
661 /* take it off the list, if it's not already */
662 spin_lock(&cifs_tcp_ses_lock);
663 list_del_init(&server->tcp_ses_list);
664 spin_unlock(&cifs_tcp_ses_lock);
665
666 spin_lock(&GlobalMid_Lock);
667 server->tcpStatus = CifsExiting;
668 spin_unlock(&GlobalMid_Lock);
669 wake_up_all(&server->response_q);
670
671 /* check if we have blocked requests that need to free */
672 /* Note that cifs_max_pending is normally 50, but
673 can be set at module install time to as little as two */
674 spin_lock(&GlobalMid_Lock);
675 if (atomic_read(&server->inFlight) >= cifs_max_pending)
676 atomic_set(&server->inFlight, cifs_max_pending - 1);
677 /* We do not want to set the max_pending too low or we
678 could end up with the counter going negative */
679 spin_unlock(&GlobalMid_Lock);
680 /* Although there should not be any requests blocked on
681 this queue it can not hurt to be paranoid and try to wake up requests
682 that may haven been blocked when more than 50 at time were on the wire
683 to the same server - they now will see the session is in exit state
684 and get out of SendReceive. */
685 wake_up_all(&server->request_q);
686 /* give those requests time to exit */
687 msleep(125);
688
689 if (server->ssocket) {
690 sock_release(csocket);
691 server->ssocket = NULL;
692 }
693 /* buffer usually freed in free_mid - need to free it here on exit */ 775 /* buffer usually freed in free_mid - need to free it here on exit */
694 cifs_buf_release(bigbuf); 776 cifs_buf_release(bigbuf);
695 if (smallbuf) /* no sense logging a debug message if NULL */ 777 if (smallbuf) /* no sense logging a debug message if NULL */
696 cifs_small_buf_release(smallbuf); 778 cifs_small_buf_release(smallbuf);
697 779
698 if (!list_empty(&server->pending_mid_q)) {
699 struct list_head dispose_list;
700
701 INIT_LIST_HEAD(&dispose_list);
702 spin_lock(&GlobalMid_Lock);
703 list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
704 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
705 cFYI(1, "Clearing mid 0x%x", mid_entry->mid);
706 mid_entry->midState = MID_SHUTDOWN;
707 list_move(&mid_entry->qhead, &dispose_list);
708 }
709 spin_unlock(&GlobalMid_Lock);
710
711 /* now walk dispose list and issue callbacks */
712 list_for_each_safe(tmp, tmp2, &dispose_list) {
713 mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
714 cFYI(1, "Callback mid 0x%x", mid_entry->mid);
715 list_del_init(&mid_entry->qhead);
716 mid_entry->callback(mid_entry);
717 }
718 /* 1/8th of sec is more than enough time for them to exit */
719 msleep(125);
720 }
721
722 if (!list_empty(&server->pending_mid_q)) {
723 /* mpx threads have not exited yet give them
724 at least the smb send timeout time for long ops */
725 /* due to delays on oplock break requests, we need
726 to wait at least 45 seconds before giving up
727 on a request getting a response and going ahead
728 and killing cifsd */
729 cFYI(1, "Wait for exit from demultiplex thread");
730 msleep(46000);
731 /* if threads still have not exited they are probably never
732 coming home not much else we can do but free the memory */
733 }
734
735 kfree(server->hostname);
736 task_to_wake = xchg(&server->tsk, NULL); 780 task_to_wake = xchg(&server->tsk, NULL);
737 kfree(server); 781 clean_demultiplex_info(server);
738
739 length = atomic_dec_return(&tcpSesAllocCount);
740 if (length > 0)
741 mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
742 GFP_KERNEL);
743 782
744 /* if server->tsk was NULL then wait for a signal before exiting */ 783 /* if server->tsk was NULL then wait for a signal before exiting */
745 if (!task_to_wake) { 784 if (!task_to_wake) {
@@ -3193,15 +3232,9 @@ mount_fail_check:
3193 else 3232 else
3194 cifs_put_tcp_session(srvTcp); 3233 cifs_put_tcp_session(srvTcp);
3195 bdi_destroy(&cifs_sb->bdi); 3234 bdi_destroy(&cifs_sb->bdi);
3196 goto out;
3197 } 3235 }
3198 3236
3199 /* volume_info->password is freed above when existing session found
3200 (in which case it is not needed anymore) but when new sesion is created
3201 the password ptr is put in the new session structure (in which case the
3202 password will be freed at unmount time) */
3203out: 3237out:
3204 /* zero out password before freeing */
3205 FreeXid(xid); 3238 FreeXid(xid);
3206 return rc; 3239 return rc;
3207} 3240}
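Most of this connect.c diff is mechanical extraction: the several-hundred-line body of cifs_demultiplex_thread() is split into allocate_buffers(), read_from_socket(), check_rfc1002_header(), find_cifs_mid() and clean_demultiplex_info(), with the logic itself nearly unchanged (the old csocket local simply becomes server->ssocket). The final mount_fail_check hunk is an unrelated cleanup that drops a goto out targeting the very next statement, along with a stale comment. The one new interface worth internalizing is read_from_socket()'s return contract, which the main loop decodes like this:

    /* rc == 0: read to_read bytes into the iov
     * rc == 1: transient error or reconnect - restart the main loop
     * rc == 2: tcpStatus hit CifsExiting - leave the thread
     * rc == 3: partial *header* read - resume via incomplete_rcv */
    rc = read_from_socket(server, &smb_msg, &iov, pdu_length,
                          &total_read, true /* header read */);
    if (rc == 3)
            goto incomplete_rcv;
    else if (rc == 2)
            break;
    else if (rc == 1)
            continue;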
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 548f06230a6d..1d2d91d9bf65 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -79,8 +79,8 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
79 /* Perform the upcall */ 79 /* Perform the upcall */
80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); 80 rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL);
81 if (rc < 0) 81 if (rc < 0)
82 cERROR(1, "%s: unable to resolve: %*.*s", 82 cFYI(1, "%s: unable to resolve: %*.*s",
83 __func__, len, len, hostname); 83 __func__, len, len, hostname);
84 else 84 else
85 cFYI(1, "%s: resolved: %*.*s to %s", 85 cFYI(1, "%s: resolved: %*.*s to %s",
86 __func__, len, len, hostname, *ip_addr); 86 __func__, len, len, hostname, *ip_addr);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 378acdafa356..9f41a10523a1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -314,6 +314,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
314 } 314 }
315 spin_unlock(&cifs_file_list_lock); 315 spin_unlock(&cifs_file_list_lock);
316 316
317 cancel_work_sync(&cifs_file->oplock_break);
318
317 if (!tcon->need_reconnect && !cifs_file->invalidHandle) { 319 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
318 int xid, rc; 320 int xid, rc;
319 321
@@ -2418,31 +2420,6 @@ void cifs_oplock_break(struct work_struct *work)
2418 cinode->clientCanCacheRead ? 1 : 0); 2420 cinode->clientCanCacheRead ? 1 : 0);
2419 cFYI(1, "Oplock release rc = %d", rc); 2421 cFYI(1, "Oplock release rc = %d", rc);
2420 } 2422 }
2421
2422 /*
2423 * We might have kicked in before is_valid_oplock_break()
2424 * finished grabbing reference for us. Make sure it's done by
2425 * waiting for cifs_file_list_lock.
2426 */
2427 spin_lock(&cifs_file_list_lock);
2428 spin_unlock(&cifs_file_list_lock);
2429
2430 cifs_oplock_break_put(cfile);
2431}
2432
2433/* must be called while holding cifs_file_list_lock */
2434void cifs_oplock_break_get(struct cifsFileInfo *cfile)
2435{
2436 cifs_sb_active(cfile->dentry->d_sb);
2437 cifsFileInfo_get(cfile);
2438}
2439
2440void cifs_oplock_break_put(struct cifsFileInfo *cfile)
2441{
2442 struct super_block *sb = cfile->dentry->d_sb;
2443
2444 cifsFileInfo_put(cfile);
2445 cifs_sb_deactive(sb);
2446} 2423}
2447 2424
2448const struct address_space_operations cifs_addr_ops = { 2425const struct address_space_operations cifs_addr_ops = {
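Taken together, these two hunks replace the cifs_oplock_break_get()/cifs_oplock_break_put() reference dance (and the lock/unlock barrier on cifs_file_list_lock) with a single cancel_work_sync() in cifsFileInfo_put(): once the final put waits for the worker, the work item no longer needs to pin the file on its own. A reduced sketch of the resulting lifecycle, with illustrative names:

/* Sketch: teardown synchronizes with the work item instead of the
 * work item holding a reference. Names are illustrative. */
struct handle {
	struct work_struct oplock_break;
	/* ... protocol state ... */
};

static void handle_put_final(struct handle *h)
{
	/* After this returns, a queued or running worker is finished
	 * and can no longer touch *h. */
	cancel_work_sync(&h->oplock_break);
	kfree(h);
}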
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9b018c8334fa..a7b2dcd4a53e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -764,20 +764,10 @@ char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
764 if (full_path == NULL) 764 if (full_path == NULL)
765 return full_path; 765 return full_path;
766 766
767 if (dfsplen) { 767 if (dfsplen)
768 strncpy(full_path, tcon->treeName, dfsplen); 768 strncpy(full_path, tcon->treeName, dfsplen);
769 /* switch slash direction in prepath depending on whether
770 * windows or posix style path names
771 */
772 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
773 int i;
774 for (i = 0; i < dfsplen; i++) {
775 if (full_path[i] == '\\')
776 full_path[i] = '/';
777 }
778 }
779 }
780 strncpy(full_path + dfsplen, vol->prepath, pplen); 769 strncpy(full_path + dfsplen, vol->prepath, pplen);
770 convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
781 full_path[dfsplen + pplen] = 0; /* add trailing null */ 771 full_path[dfsplen + pplen] = 0; /* add trailing null */
782 return full_path; 772 return full_path;
783} 773}
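The removed open-coded loop flipped '\\' to '/' only for POSIX-path mounts; convert_delimiter() now normalizes the whole path to whatever CIFS_DIR_SEP(cifs_sb) selects. A standalone sketch of what such a helper does, assuming this simplified two-separator behavior:

/* Sketch of a delimiter-normalizing helper (simplified; the real
 * convert_delimiter() is a cifs inline helper). */
static void convert_delim_sketch(char *path, char delim)
{
	char old_delim = (delim == '/') ? '\\' : '/';
	char *pos;

	for (pos = path; *pos; pos++)
		if (*pos == old_delim)
			*pos = delim;
}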
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 03a1f491d39b..7c1693392598 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -585,15 +585,8 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
585 585
586 cifs_set_oplock_level(pCifsInode, 586 cifs_set_oplock_level(pCifsInode,
587 pSMB->OplockLevel ? OPLOCK_READ : 0); 587 pSMB->OplockLevel ? OPLOCK_READ : 0);
588 /* 588 queue_work(system_nrt_wq,
589 * cifs_oplock_break_put() can't be called 589 &netfile->oplock_break);
590 * from here. Get reference after queueing
591 * succeeded. cifs_oplock_break() will
592 * synchronize using cifs_file_list_lock.
593 */
594 if (queue_work(system_nrt_wq,
595 &netfile->oplock_break))
596 cifs_oplock_break_get(netfile);
597 netfile->oplock_break_cancelled = false; 590 netfile->oplock_break_cancelled = false;
598 591
599 spin_unlock(&cifs_file_list_lock); 592 spin_unlock(&cifs_file_list_lock);
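This is the producer half of the cancel_work_sync() change in fs/cifs/file.c above. Since teardown now waits for the worker, queueing no longer has to be paired with taking a reference, and the conditional collapses; a before/after sketch in comment form (illustrative):

/*
 * before:	if (queue_work(wq, &netfile->oplock_break))
 *			cifs_oplock_break_get(netfile);	// pin the file
 * after:	queue_work(wq, &netfile->oplock_break);	// put() cancels
 */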
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 147aa22c3c3a..c1b9c4b10739 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -362,6 +362,8 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
362 mid = AllocMidQEntry(hdr, server); 362 mid = AllocMidQEntry(hdr, server);
363 if (mid == NULL) { 363 if (mid == NULL) {
364 mutex_unlock(&server->srv_mutex); 364 mutex_unlock(&server->srv_mutex);
365 atomic_dec(&server->inFlight);
366 wake_up(&server->request_q);
365 return -ENOMEM; 367 return -ENOMEM;
366 } 368 }
367 369
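The two added lines plug an accounting leak: by the time AllocMidQEntry() can fail, cifs_call_async() has already claimed a slot in server->inFlight, so the error path must give the slot back and wake any sender throttled on request_q. A sketch of the unwind rule, with illustrative names and types:

/* Sketch: release, in reverse order, everything acquired before the
 * failure point. */
struct xmit_ctx {
	atomic_t in_flight;
	wait_queue_head_t request_q;
};

static int submit_or_unwind(struct xmit_ctx *ctx, void *(*alloc)(void))
{
	void *req;

	atomic_inc(&ctx->in_flight);		/* slot claimed first */
	req = alloc();
	if (!req) {
		atomic_dec(&ctx->in_flight);	/* give the slot back */
		wake_up(&ctx->request_q);	/* unblock throttled senders */
		return -ENOMEM;
	}
	/* ... build and send req ... */
	return 0;
}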
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8be086e9abe4..51352de88ef1 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1003,6 +1003,7 @@ COMPATIBLE_IOCTL(PPPIOCCONNECT)
1003COMPATIBLE_IOCTL(PPPIOCDISCONN) 1003COMPATIBLE_IOCTL(PPPIOCDISCONN)
1004COMPATIBLE_IOCTL(PPPIOCATTCHAN) 1004COMPATIBLE_IOCTL(PPPIOCATTCHAN)
1005COMPATIBLE_IOCTL(PPPIOCGCHAN) 1005COMPATIBLE_IOCTL(PPPIOCGCHAN)
1006COMPATIBLE_IOCTL(PPPIOCGL2TPSTATS)
1006/* PPPOX */ 1007/* PPPOX */
1007COMPATIBLE_IOCTL(PPPOEIOCSFWD) 1008COMPATIBLE_IOCTL(PPPOEIOCSFWD)
1008COMPATIBLE_IOCTL(PPPOEIOCDFWD) 1009COMPATIBLE_IOCTL(PPPOEIOCDFWD)
diff --git a/fs/dcache.c b/fs/dcache.c
index b05aac3a8cfc..a88948b8bd17 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -301,6 +301,27 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
301 return parent; 301 return parent;
302} 302}
303 303
304/*
305 * Unhash a dentry without inserting an RCU walk barrier or checking that
306 * dentry->d_lock is locked. The caller must take care of that, if
307 * appropriate.
308 */
309static void __d_shrink(struct dentry *dentry)
310{
311 if (!d_unhashed(dentry)) {
312 struct hlist_bl_head *b;
313 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
314 b = &dentry->d_sb->s_anon;
315 else
316 b = d_hash(dentry->d_parent, dentry->d_name.hash);
317
318 hlist_bl_lock(b);
319 __hlist_bl_del(&dentry->d_hash);
320 dentry->d_hash.pprev = NULL;
321 hlist_bl_unlock(b);
322 }
323}
324
304/** 325/**
305 * d_drop - drop a dentry 326 * d_drop - drop a dentry
306 * @dentry: dentry to drop 327 * @dentry: dentry to drop
@@ -319,17 +340,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
319void __d_drop(struct dentry *dentry) 340void __d_drop(struct dentry *dentry)
320{ 341{
321 if (!d_unhashed(dentry)) { 342 if (!d_unhashed(dentry)) {
322 struct hlist_bl_head *b; 343 __d_shrink(dentry);
323 if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
324 b = &dentry->d_sb->s_anon;
325 else
326 b = d_hash(dentry->d_parent, dentry->d_name.hash);
327
328 hlist_bl_lock(b);
329 __hlist_bl_del(&dentry->d_hash);
330 dentry->d_hash.pprev = NULL;
331 hlist_bl_unlock(b);
332
333 dentry_rcuwalk_barrier(dentry); 344 dentry_rcuwalk_barrier(dentry);
334 } 345 }
335} 346}
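The split gives two entry points with distinct contracts: __d_shrink() only removes the dentry from its hash chain (the caller handles locking and RCU concerns), while __d_drop() keeps its old behavior of unhashing plus dentry_rcuwalk_barrier(). Usage, as the later hunks apply it:

/* Normal path: lockless-lookup walkers may be racing with us. */
spin_lock(&dentry->d_lock);
__d_drop(dentry);	/* __d_shrink() + dentry_rcuwalk_barrier() */
spin_unlock(&dentry->d_lock);

/* Umount path: the filesystem is already unreachable, so the
 * cheaper __d_shrink() suffices (see the umount hunks below). */
__d_shrink(dentry);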
@@ -784,6 +795,7 @@ relock:
784 795
785/** 796/**
786 * prune_dcache_sb - shrink the dcache 797 * prune_dcache_sb - shrink the dcache
798 * @sb: superblock
787 * @nr_to_scan: number of entries to try to free 799 * @nr_to_scan: number of entries to try to free
788 * 800 *
789 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is 801 * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
@@ -828,44 +840,24 @@ EXPORT_SYMBOL(shrink_dcache_sb);
828static void shrink_dcache_for_umount_subtree(struct dentry *dentry) 840static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
829{ 841{
830 struct dentry *parent; 842 struct dentry *parent;
831 unsigned detached = 0;
832 843
833 BUG_ON(!IS_ROOT(dentry)); 844 BUG_ON(!IS_ROOT(dentry));
834 845
835 /* detach this root from the system */
836 spin_lock(&dentry->d_lock);
837 dentry_lru_del(dentry);
838 __d_drop(dentry);
839 spin_unlock(&dentry->d_lock);
840
841 for (;;) { 846 for (;;) {
842 /* descend to the first leaf in the current subtree */ 847 /* descend to the first leaf in the current subtree */
843 while (!list_empty(&dentry->d_subdirs)) { 848 while (!list_empty(&dentry->d_subdirs))
844 struct dentry *loop;
845
846 /* this is a branch with children - detach all of them
847 * from the system in one go */
848 spin_lock(&dentry->d_lock);
849 list_for_each_entry(loop, &dentry->d_subdirs,
850 d_u.d_child) {
851 spin_lock_nested(&loop->d_lock,
852 DENTRY_D_LOCK_NESTED);
853 dentry_lru_del(loop);
854 __d_drop(loop);
855 spin_unlock(&loop->d_lock);
856 }
857 spin_unlock(&dentry->d_lock);
858
859 /* move to the first child */
860 dentry = list_entry(dentry->d_subdirs.next, 849 dentry = list_entry(dentry->d_subdirs.next,
861 struct dentry, d_u.d_child); 850 struct dentry, d_u.d_child);
862 }
863 851
864 /* consume the dentries from this leaf up through its parents 852 /* consume the dentries from this leaf up through its parents
865 * until we find one with children or run out altogether */ 853 * until we find one with children or run out altogether */
866 do { 854 do {
867 struct inode *inode; 855 struct inode *inode;
868 856
857 /* detach from the system */
858 dentry_lru_del(dentry);
859 __d_shrink(dentry);
860
869 if (dentry->d_count != 0) { 861 if (dentry->d_count != 0) {
870 printk(KERN_ERR 862 printk(KERN_ERR
871 "BUG: Dentry %p{i=%lx,n=%s}" 863 "BUG: Dentry %p{i=%lx,n=%s}"
@@ -886,14 +878,10 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
886 list_del(&dentry->d_u.d_child); 878 list_del(&dentry->d_u.d_child);
887 } else { 879 } else {
888 parent = dentry->d_parent; 880 parent = dentry->d_parent;
889 spin_lock(&parent->d_lock);
890 parent->d_count--; 881 parent->d_count--;
891 list_del(&dentry->d_u.d_child); 882 list_del(&dentry->d_u.d_child);
892 spin_unlock(&parent->d_lock);
893 } 883 }
894 884
895 detached++;
896
897 inode = dentry->d_inode; 885 inode = dentry->d_inode;
898 if (inode) { 886 if (inode) {
899 dentry->d_inode = NULL; 887 dentry->d_inode = NULL;
@@ -938,9 +926,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
938 926
939 dentry = sb->s_root; 927 dentry = sb->s_root;
940 sb->s_root = NULL; 928 sb->s_root = NULL;
941 spin_lock(&dentry->d_lock);
942 dentry->d_count--; 929 dentry->d_count--;
943 spin_unlock(&dentry->d_lock);
944 shrink_dcache_for_umount_subtree(dentry); 930 shrink_dcache_for_umount_subtree(dentry);
945 931
946 while (!hlist_bl_empty(&sb->s_anon)) { 932 while (!hlist_bl_empty(&sb->s_anon)) {
@@ -1743,7 +1729,7 @@ seqretry:
1743 */ 1729 */
1744 if (read_seqcount_retry(&dentry->d_seq, *seq)) 1730 if (read_seqcount_retry(&dentry->d_seq, *seq))
1745 goto seqretry; 1731 goto seqretry;
1746 if (parent->d_flags & DCACHE_OP_COMPARE) { 1732 if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
1747 if (parent->d_op->d_compare(parent, *inode, 1733 if (parent->d_op->d_compare(parent, *inode,
1748 dentry, i, 1734 dentry, i,
1749 tlen, tname, name)) 1735 tlen, tname, name))
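With the superblock unreachable at umount, the rewritten shrink_dcache_for_umount_subtree() drops the up-front mass detach and the per-parent locking; it simply descends to a leaf and unhashes each dentry (dentry_lru_del() + __d_shrink()) at the moment it is consumed. A standalone sketch of the same iterative post-order teardown on a toy tree:

#include <stdlib.h>

struct node {
	struct node *parent;
	struct node *first_child;
	struct node *next_sibling;
};

static void teardown(struct node *root)
{
	struct node *n = root;

	for (;;) {
		while (n->first_child)		/* descend to a leaf */
			n = n->first_child;

		for (;;) {			/* consume upward */
			struct node *parent = n->parent;
			int was_root = (n == root);

			if (parent)
				parent->first_child = n->next_sibling;
			free(n);
			if (was_root)
				return;
			if (parent->first_child) {
				/* a sibling subtree remains: descend it */
				n = parent->first_child;
				break;
			}
			n = parent;	/* parent is now a leaf; eat it */
		}
	}
}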
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 2d0f757fda3e..c5a5855a6c44 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,8 @@
12# Kbuild - Gets included from the Kernels Makefile and build system 12# Kbuild - Gets included from the Kernels Makefile and build system
13# 13#
14 14
15exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o 15# ore module library
16obj-$(CONFIG_ORE) += ore.o
17
18exofs-y := inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o 19obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index 86194b2f799d..70bae4149291 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,6 +1,10 @@
1config ORE
2 tristate
3
1config EXOFS_FS 4config EXOFS_FS
2 tristate "exofs: OSD based file system support" 5 tristate "exofs: OSD based file system support"
3 depends on SCSI_OSD_ULD 6 depends on SCSI_OSD_ULD
7 select ORE
4 help 8 help
5 EXOFS is a file system that uses an OSD storage device, 9 EXOFS is a file system that uses an OSD storage device,
6 as its backing storage. 10 as its backing storage.
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index c965806c2821..f4e442ec7445 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -36,12 +36,9 @@
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h> 38#include <linux/backing-dev.h>
39#include "common.h" 39#include <scsi/osd_ore.h>
40 40
41/* FIXME: Remove once pnfs hits mainline 41#include "common.h"
42 * #include <linux/exportfs/pnfs_osd_xdr.h>
43 */
44#include "pnfs.h"
45 42
46#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) 43#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
47 44
@@ -56,27 +53,11 @@
56/* u64 has problems with printk this will cast it to unsigned long long */ 53/* u64 has problems with printk this will cast it to unsigned long long */
57#define _LLU(x) (unsigned long long)(x) 54#define _LLU(x) (unsigned long long)(x)
58 55
59struct exofs_layout {
60 osd_id s_pid; /* partition ID of file system*/
61
62 /* Our way of looking at the data_map */
63 unsigned stripe_unit;
64 unsigned mirrors_p1;
65
66 unsigned group_width;
67 u64 group_depth;
68 unsigned group_count;
69
70 enum exofs_inode_layout_gen_functions lay_func;
71
72 unsigned s_numdevs; /* Num of devices in array */
73 struct osd_dev *s_ods[0]; /* Variable length */
74};
75
76/* 56/*
77 * our extension to the in-memory superblock 57 * our extension to the in-memory superblock
78 */ 58 */
79struct exofs_sb_info { 59struct exofs_sb_info {
60 struct backing_dev_info bdi; /* register our bdi with VFS */
80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ 61 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 62 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 63 uint64_t s_nextid; /* highest object ID used */
@@ -84,16 +65,13 @@ struct exofs_sb_info {
84 spinlock_t s_next_gen_lock; /* spinlock for gen # update */ 65 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
85 u32 s_next_generation; /* next gen # to use */ 66 u32 s_next_generation; /* next gen # to use */
86 atomic_t s_curr_pending; /* number of pending commands */ 67 atomic_t s_curr_pending; /* number of pending commands */
87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
89 68
90 struct pnfs_osd_data_map data_map; /* Default raid to use 69 struct pnfs_osd_data_map data_map; /* Default raid to use
91 * FIXME: Needed ? 70 * FIXME: Needed ?
92 */ 71 */
93/* struct exofs_layout dir_layout;*/ /* Default dir layout */ 72 struct ore_layout layout; /* Default files layout */
94 struct exofs_layout layout; /* Default files layout, 73 struct ore_comp one_comp; /* id & cred of partition id=0*/
95 * contains the variable osd_dev 74 struct ore_components comps; /* comps for the partition */
96 * array. Keep last */
97 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ 75 struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */
98}; 76};
99 77
@@ -107,7 +85,8 @@ struct exofs_i_info {
107 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ 85 uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/
108 uint32_t i_dir_start_lookup; /* which page to start lookup */ 86 uint32_t i_dir_start_lookup; /* which page to start lookup */
109 uint64_t i_commit_size; /* the object's written length */ 87 uint64_t i_commit_size; /* the object's written length */
110 uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ 88 struct ore_comp one_comp; /* same component for all devices */
89 struct ore_components comps; /* inode view of the device table */
111}; 90};
112 91
113static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) 92static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
@@ -115,52 +94,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
115 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; 94 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
116} 95}
117 96
118struct exofs_io_state;
119typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
120
121struct exofs_io_state {
122 struct kref kref;
123
124 void *private;
125 exofs_io_done_fn done;
126
127 struct exofs_layout *layout;
128 struct osd_obj_id obj;
129 u8 *cred;
130
131 /* Global read/write IO*/
132 loff_t offset;
133 unsigned long length;
134 void *kern_buff;
135
136 struct page **pages;
137 unsigned nr_pages;
138 unsigned pgbase;
139 unsigned pages_consumed;
140
141 /* Attributes */
142 unsigned in_attr_len;
143 struct osd_attr *in_attr;
144 unsigned out_attr_len;
145 struct osd_attr *out_attr;
146
147 /* Variable array of size numdevs */
148 unsigned numdevs;
149 struct exofs_per_dev_state {
150 struct osd_request *or;
151 struct bio *bio;
152 loff_t offset;
153 unsigned length;
154 unsigned dev;
155 } per_dev[];
156};
157
158static inline unsigned exofs_io_state_size(unsigned numdevs)
159{
160 return sizeof(struct exofs_io_state) +
161 sizeof(struct exofs_per_dev_state) * numdevs;
162}
163
164/* 97/*
165 * our inode flags 98 * our inode flags
166 */ 99 */
@@ -205,12 +138,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
205} 138}
206 139
207/* 140/*
208 * Given a layout, object_number and stripe_index return the associated global
209 * dev_index
210 */
211unsigned exofs_layout_od_id(struct exofs_layout *layout,
212 osd_id obj_no, unsigned layout_index);
213/*
214 * Maximum count of links to a file 141 * Maximum count of links to a file
215 */ 142 */
216#define EXOFS_LINK_MAX 32000 143#define EXOFS_LINK_MAX 32000
@@ -219,44 +146,8 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout,
219 * function declarations * 146 * function declarations *
220 *************************/ 147 *************************/
221 148
222/* ios.c */
223void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
224 const struct osd_obj_id *obj);
225int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
226 u64 offset, void *p, unsigned length);
227
228int exofs_get_io_state(struct exofs_layout *layout,
229 struct exofs_io_state **ios);
230void exofs_put_io_state(struct exofs_io_state *ios);
231
232int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
233
234int exofs_sbi_create(struct exofs_io_state *ios);
235int exofs_sbi_remove(struct exofs_io_state *ios);
236int exofs_sbi_write(struct exofs_io_state *ios);
237int exofs_sbi_read(struct exofs_io_state *ios);
238
239int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
240
241int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
242static inline int exofs_oi_write(struct exofs_i_info *oi,
243 struct exofs_io_state *ios)
244{
245 ios->obj.id = exofs_oi_objno(oi);
246 ios->cred = oi->i_cred;
247 return exofs_sbi_write(ios);
248}
249
250static inline int exofs_oi_read(struct exofs_i_info *oi,
251 struct exofs_io_state *ios)
252{
253 ios->obj.id = exofs_oi_objno(oi);
254 ios->cred = oi->i_cred;
255 return exofs_sbi_read(ios);
256}
257
258/* inode.c */ 149/* inode.c */
259unsigned exofs_max_io_pages(struct exofs_layout *layout, 150unsigned exofs_max_io_pages(struct ore_layout *layout,
260 unsigned expected_pages); 151 unsigned expected_pages);
261int exofs_setattr(struct dentry *, struct iattr *); 152int exofs_setattr(struct dentry *, struct iattr *);
262int exofs_write_begin(struct file *file, struct address_space *mapping, 153int exofs_write_begin(struct file *file, struct address_space *mapping,
@@ -281,6 +172,8 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
281 struct inode *); 172 struct inode *);
282 173
283/* super.c */ 174/* super.c */
175void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
176 const struct osd_obj_id *obj);
284int exofs_sbi_write_stats(struct exofs_sb_info *sbi); 177int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
285 178
286/********************* 179/*********************
@@ -295,7 +188,6 @@ extern const struct file_operations exofs_file_operations;
295 188
296/* inode.c */ 189/* inode.c */
297extern const struct address_space_operations exofs_aops; 190extern const struct address_space_operations exofs_aops;
298extern const struct osd_attr g_attr_logical_length;
299 191
300/* namei.c */ 192/* namei.c */
301extern const struct inode_operations exofs_dir_inode_operations; 193extern const struct inode_operations exofs_dir_inode_operations;
@@ -305,4 +197,33 @@ extern const struct inode_operations exofs_special_inode_operations;
305extern const struct inode_operations exofs_symlink_inode_operations; 197extern const struct inode_operations exofs_symlink_inode_operations;
306extern const struct inode_operations exofs_fast_symlink_inode_operations; 198extern const struct inode_operations exofs_fast_symlink_inode_operations;
307 199
200/* exofs_init_comps will initialize an ore_components device array
201 * pointing to a single ore_comp struct, and a round-robin view
202 * of the device table.
203 * The first device of each inode is the [inode->ino % num_devices]
204 * and the rest of the devices sequentially following where the
205 * first device is after the last device.
206 * It is assumed that the global device array at @sbi is twice
207 * as big and that the device table repeats twice.
208 * See: exofs_read_lookup_dev_table()
209 */
210static inline void exofs_init_comps(struct ore_components *comps,
211 struct ore_comp *one_comp,
212 struct exofs_sb_info *sbi, osd_id oid)
213{
214 unsigned dev_mod = (unsigned)oid, first_dev;
215
216 one_comp->obj.partition = sbi->one_comp.obj.partition;
217 one_comp->obj.id = oid;
218 exofs_make_credential(one_comp->cred, &one_comp->obj);
219
220 comps->numdevs = sbi->comps.numdevs;
221 comps->single_comp = EC_SINGLE_COMP;
222 comps->comps = one_comp;
223
224 /* Round robin device view of the table */
225 first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->comps.numdevs;
226 comps->ods = sbi->comps.ods + first_dev;
227}
228
308#endif 229#endif
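The exofs_init_comps() comment is easiest to check with numbers. With first_dev = (oid * mirrors_p1) % numdevs and the device table stored twice in a row, every inode sees a window of numdevs entries that is just a rotation of the full table. A runnable walk-through, with numdevs = 4 and mirrors_p1 = 2 assumed for the example:

#include <stdio.h>

int main(void)
{
	unsigned numdevs = 4, mirrors_p1 = 2;	/* example values */
	unsigned long long oid;

	for (oid = 0; oid < 6; oid++) {
		unsigned first_dev =
			((unsigned)oid * mirrors_p1) % numdevs;

		/* the inode's view is ods[first_dev..first_dev+3] of
		 * the doubled table: a rotation of devices 0..3 */
		printf("oid=%llu -> first_dev=%u\n", oid, first_dev);
	}
	return 0;
}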
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 8472c098445d..f39a38fc2349 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,7 +43,7 @@ enum { BIO_MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *), 43 PAGE_SIZE / sizeof(struct page *),
44}; 44};
45 45
46unsigned exofs_max_io_pages(struct exofs_layout *layout, 46unsigned exofs_max_io_pages(struct ore_layout *layout,
47 unsigned expected_pages) 47 unsigned expected_pages)
48{ 48{
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC); 49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
@@ -58,7 +58,7 @@ struct page_collect {
58 struct exofs_sb_info *sbi; 58 struct exofs_sb_info *sbi;
59 struct inode *inode; 59 struct inode *inode;
60 unsigned expected_pages; 60 unsigned expected_pages;
61 struct exofs_io_state *ios; 61 struct ore_io_state *ios;
62 62
63 struct page **pages; 63 struct page **pages;
64 unsigned alloc_pages; 64 unsigned alloc_pages;
@@ -110,13 +110,6 @@ static int pcol_try_alloc(struct page_collect *pcol)
110{ 110{
111 unsigned pages; 111 unsigned pages;
112 112
113 if (!pcol->ios) { /* First time allocate io_state */
114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
115
116 if (ret)
117 return ret;
118 }
119
120 /* TODO: easily support bio chaining */ 113 /* TODO: easily support bio chaining */
121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); 114 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
122 115
@@ -140,7 +133,7 @@ static void pcol_free(struct page_collect *pcol)
140 pcol->pages = NULL; 133 pcol->pages = NULL;
141 134
142 if (pcol->ios) { 135 if (pcol->ios) {
143 exofs_put_io_state(pcol->ios); 136 ore_put_io_state(pcol->ios);
144 pcol->ios = NULL; 137 pcol->ios = NULL;
145 } 138 }
146} 139}
@@ -200,7 +193,7 @@ static int __readpages_done(struct page_collect *pcol)
200 u64 resid; 193 u64 resid;
201 u64 good_bytes; 194 u64 good_bytes;
202 u64 length = 0; 195 u64 length = 0;
203 int ret = exofs_check_io(pcol->ios, &resid); 196 int ret = ore_check_io(pcol->ios, &resid);
204 197
205 if (likely(!ret)) 198 if (likely(!ret))
206 good_bytes = pcol->length; 199 good_bytes = pcol->length;
@@ -241,7 +234,7 @@ static int __readpages_done(struct page_collect *pcol)
241} 234}
242 235
243/* callback of async reads */ 236/* callback of async reads */
244static void readpages_done(struct exofs_io_state *ios, void *p) 237static void readpages_done(struct ore_io_state *ios, void *p)
245{ 238{
246 struct page_collect *pcol = p; 239 struct page_collect *pcol = p;
247 240
@@ -269,20 +262,28 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
269static int read_exec(struct page_collect *pcol) 262static int read_exec(struct page_collect *pcol)
270{ 263{
271 struct exofs_i_info *oi = exofs_i(pcol->inode); 264 struct exofs_i_info *oi = exofs_i(pcol->inode);
272 struct exofs_io_state *ios = pcol->ios; 265 struct ore_io_state *ios;
273 struct page_collect *pcol_copy = NULL; 266 struct page_collect *pcol_copy = NULL;
274 int ret; 267 int ret;
275 268
276 if (!pcol->pages) 269 if (!pcol->pages)
277 return 0; 270 return 0;
278 271
272 if (!pcol->ios) {
273 int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, true,
274 pcol->pg_first << PAGE_CACHE_SHIFT,
275 pcol->length, &pcol->ios);
276
277 if (ret)
278 return ret;
279 }
280
281 ios = pcol->ios;
279 ios->pages = pcol->pages; 282 ios->pages = pcol->pages;
280 ios->nr_pages = pcol->nr_pages; 283 ios->nr_pages = pcol->nr_pages;
281 ios->length = pcol->length;
282 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
283 284
284 if (pcol->read_4_write) { 285 if (pcol->read_4_write) {
285 exofs_oi_read(oi, pcol->ios); 286 ore_read(pcol->ios);
286 return __readpages_done(pcol); 287 return __readpages_done(pcol);
287 } 288 }
288 289
@@ -295,14 +296,14 @@ static int read_exec(struct page_collect *pcol)
295 *pcol_copy = *pcol; 296 *pcol_copy = *pcol;
296 ios->done = readpages_done; 297 ios->done = readpages_done;
297 ios->private = pcol_copy; 298 ios->private = pcol_copy;
298 ret = exofs_oi_read(oi, ios); 299 ret = ore_read(ios);
299 if (unlikely(ret)) 300 if (unlikely(ret))
300 goto err; 301 goto err;
301 302
302 atomic_inc(&pcol->sbi->s_curr_pending); 303 atomic_inc(&pcol->sbi->s_curr_pending);
303 304
304 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 305 EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
305 ios->obj.id, _LLU(ios->offset), pcol->length); 306 oi->one_comp.obj.id, _LLU(ios->offset), pcol->length);
306 307
307 /* pages ownership was passed to pcol_copy */ 308 /* pages ownership was passed to pcol_copy */
308 _pcol_reset(pcol); 309 _pcol_reset(pcol);
@@ -457,14 +458,14 @@ static int exofs_readpage(struct file *file, struct page *page)
457} 458}
458 459
459/* Callback for osd_write. All writes are asynchronous */ 460/* Callback for osd_write. All writes are asynchronous */
460static void writepages_done(struct exofs_io_state *ios, void *p) 461static void writepages_done(struct ore_io_state *ios, void *p)
461{ 462{
462 struct page_collect *pcol = p; 463 struct page_collect *pcol = p;
463 int i; 464 int i;
464 u64 resid; 465 u64 resid;
465 u64 good_bytes; 466 u64 good_bytes;
466 u64 length = 0; 467 u64 length = 0;
467 int ret = exofs_check_io(ios, &resid); 468 int ret = ore_check_io(ios, &resid);
468 469
469 atomic_dec(&pcol->sbi->s_curr_pending); 470 atomic_dec(&pcol->sbi->s_curr_pending);
470 471
@@ -507,13 +508,21 @@ static void writepages_done(struct exofs_io_state *ios, void *p)
507static int write_exec(struct page_collect *pcol) 508static int write_exec(struct page_collect *pcol)
508{ 509{
509 struct exofs_i_info *oi = exofs_i(pcol->inode); 510 struct exofs_i_info *oi = exofs_i(pcol->inode);
510 struct exofs_io_state *ios = pcol->ios; 511 struct ore_io_state *ios;
511 struct page_collect *pcol_copy = NULL; 512 struct page_collect *pcol_copy = NULL;
512 int ret; 513 int ret;
513 514
514 if (!pcol->pages) 515 if (!pcol->pages)
515 return 0; 516 return 0;
516 517
518 BUG_ON(pcol->ios);
519 ret = ore_get_rw_state(&pcol->sbi->layout, &oi->comps, false,
520 pcol->pg_first << PAGE_CACHE_SHIFT,
521 pcol->length, &pcol->ios);
522
523 if (unlikely(ret))
524 goto err;
525
517 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 526 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
518 if (!pcol_copy) { 527 if (!pcol_copy) {
519 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); 528 EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n");
@@ -523,16 +532,15 @@ static int write_exec(struct page_collect *pcol)
523 532
524 *pcol_copy = *pcol; 533 *pcol_copy = *pcol;
525 534
535 ios = pcol->ios;
526 ios->pages = pcol_copy->pages; 536 ios->pages = pcol_copy->pages;
527 ios->nr_pages = pcol_copy->nr_pages; 537 ios->nr_pages = pcol_copy->nr_pages;
528 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
529 ios->length = pcol_copy->length;
530 ios->done = writepages_done; 538 ios->done = writepages_done;
531 ios->private = pcol_copy; 539 ios->private = pcol_copy;
532 540
533 ret = exofs_oi_write(oi, ios); 541 ret = ore_write(ios);
534 if (unlikely(ret)) { 542 if (unlikely(ret)) {
535 EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); 543 EXOFS_ERR("write_exec: ore_write() Failed\n");
536 goto err; 544 goto err;
537 } 545 }
538 546
@@ -844,17 +852,15 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode)
844 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); 852 return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
845} 853}
846 854
847const struct osd_attr g_attr_logical_length = ATTR_DEF(
848 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
849
850static int _do_truncate(struct inode *inode, loff_t newsize) 855static int _do_truncate(struct inode *inode, loff_t newsize)
851{ 856{
852 struct exofs_i_info *oi = exofs_i(inode); 857 struct exofs_i_info *oi = exofs_i(inode);
858 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
853 int ret; 859 int ret;
854 860
855 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 861 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
856 862
857 ret = exofs_oi_truncate(oi, (u64)newsize); 863 ret = ore_truncate(&sbi->layout, &oi->comps, (u64)newsize);
858 if (likely(!ret)) 864 if (likely(!ret))
859 truncate_setsize(inode, newsize); 865 truncate_setsize(inode, newsize);
860 866
@@ -917,30 +923,26 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
917 [1] = g_attr_inode_file_layout, 923 [1] = g_attr_inode_file_layout,
918 [2] = g_attr_inode_dir_layout, 924 [2] = g_attr_inode_dir_layout,
919 }; 925 };
920 struct exofs_io_state *ios; 926 struct ore_io_state *ios;
921 struct exofs_on_disk_inode_layout *layout; 927 struct exofs_on_disk_inode_layout *layout;
922 int ret; 928 int ret;
923 929
924 ret = exofs_get_io_state(&sbi->layout, &ios); 930 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
925 if (unlikely(ret)) { 931 if (unlikely(ret)) {
926 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 932 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
927 return ret; 933 return ret;
928 } 934 }
929 935
930 ios->obj.id = exofs_oi_objno(oi); 936 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
931 exofs_make_credential(oi->i_cred, &ios->obj); 937 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->comps.numdevs);
932 ios->cred = oi->i_cred;
933
934 attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
935 attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs);
936 938
937 ios->in_attr = attrs; 939 ios->in_attr = attrs;
938 ios->in_attr_len = ARRAY_SIZE(attrs); 940 ios->in_attr_len = ARRAY_SIZE(attrs);
939 941
940 ret = exofs_sbi_read(ios); 942 ret = ore_read(ios);
941 if (unlikely(ret)) { 943 if (unlikely(ret)) {
942 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", 944 EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n",
943 _LLU(ios->obj.id), ret); 945 _LLU(oi->one_comp.obj.id), ret);
944 memset(inode, 0, sizeof(*inode)); 946 memset(inode, 0, sizeof(*inode));
945 inode->i_mode = 0040000 | (0777 & ~022); 947 inode->i_mode = 0040000 | (0777 & ~022);
946 /* If object is lost on target we might as well enable its 948 /* If object is lost on target we might as well enable its
@@ -990,7 +992,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
990 } 992 }
991 993
992out: 994out:
993 exofs_put_io_state(ios); 995 ore_put_io_state(ios);
994 return ret; 996 return ret;
995} 997}
996 998
@@ -1016,6 +1018,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
1016 return inode; 1018 return inode;
1017 oi = exofs_i(inode); 1019 oi = exofs_i(inode);
1018 __oi_init(oi); 1020 __oi_init(oi);
1021 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
1022 exofs_oi_objno(oi));
1019 1023
1020 /* read the inode from the osd */ 1024 /* read the inode from the osd */
1021 ret = exofs_get_inode(sb, oi, &fcb); 1025 ret = exofs_get_inode(sb, oi, &fcb);
@@ -1107,21 +1111,22 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1107 * set the obj_created flag so that other methods know that the object exists on 1111 * set the obj_created flag so that other methods know that the object exists on
1108 * the OSD. 1112 * the OSD.
1109 */ 1113 */
1110static void create_done(struct exofs_io_state *ios, void *p) 1114static void create_done(struct ore_io_state *ios, void *p)
1111{ 1115{
1112 struct inode *inode = p; 1116 struct inode *inode = p;
1113 struct exofs_i_info *oi = exofs_i(inode); 1117 struct exofs_i_info *oi = exofs_i(inode);
1114 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 1118 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1115 int ret; 1119 int ret;
1116 1120
1117 ret = exofs_check_io(ios, NULL); 1121 ret = ore_check_io(ios, NULL);
1118 exofs_put_io_state(ios); 1122 ore_put_io_state(ios);
1119 1123
1120 atomic_dec(&sbi->s_curr_pending); 1124 atomic_dec(&sbi->s_curr_pending);
1121 1125
1122 if (unlikely(ret)) { 1126 if (unlikely(ret)) {
1123 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", 1127 EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx",
1124 _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); 1128 _LLU(exofs_oi_objno(oi)),
1129 _LLU(oi->one_comp.obj.partition));
1125 /*TODO: When FS is corrupted creation can fail, object already 1130 /*TODO: When FS is corrupted creation can fail, object already
1126 * exist. Get rid of this asynchronous creation, if exist 1131 * exist. Get rid of this asynchronous creation, if exist
1127 * increment the obj counter and try the next object. Until we 1132 * increment the obj counter and try the next object. Until we
@@ -1140,14 +1145,13 @@ static void create_done(struct exofs_io_state *ios, void *p)
1140 */ 1145 */
1141struct inode *exofs_new_inode(struct inode *dir, int mode) 1146struct inode *exofs_new_inode(struct inode *dir, int mode)
1142{ 1147{
1143 struct super_block *sb; 1148 struct super_block *sb = dir->i_sb;
1149 struct exofs_sb_info *sbi = sb->s_fs_info;
1144 struct inode *inode; 1150 struct inode *inode;
1145 struct exofs_i_info *oi; 1151 struct exofs_i_info *oi;
1146 struct exofs_sb_info *sbi; 1152 struct ore_io_state *ios;
1147 struct exofs_io_state *ios;
1148 int ret; 1153 int ret;
1149 1154
1150 sb = dir->i_sb;
1151 inode = new_inode(sb); 1155 inode = new_inode(sb);
1152 if (!inode) 1156 if (!inode)
1153 return ERR_PTR(-ENOMEM); 1157 return ERR_PTR(-ENOMEM);
@@ -1157,8 +1161,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1157 1161
1158 set_obj_2bcreated(oi); 1162 set_obj_2bcreated(oi);
1159 1163
1160 sbi = sb->s_fs_info;
1161
1162 inode->i_mapping->backing_dev_info = sb->s_bdi; 1164 inode->i_mapping->backing_dev_info = sb->s_bdi;
1163 inode_init_owner(inode, dir, mode); 1165 inode_init_owner(inode, dir, mode);
1164 inode->i_ino = sbi->s_nextid++; 1166 inode->i_ino = sbi->s_nextid++;
@@ -1170,25 +1172,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1170 spin_unlock(&sbi->s_next_gen_lock); 1172 spin_unlock(&sbi->s_next_gen_lock);
1171 insert_inode_hash(inode); 1173 insert_inode_hash(inode);
1172 1174
1175 exofs_init_comps(&oi->comps, &oi->one_comp, sb->s_fs_info,
1176 exofs_oi_objno(oi));
1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ 1177 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1174 1178
1175 mark_inode_dirty(inode); 1179 mark_inode_dirty(inode);
1176 1180
1177 ret = exofs_get_io_state(&sbi->layout, &ios); 1181 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1178 if (unlikely(ret)) { 1182 if (unlikely(ret)) {
1179 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); 1183 EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n");
1180 return ERR_PTR(ret); 1184 return ERR_PTR(ret);
1181 } 1185 }
1182 1186
1183 ios->obj.id = exofs_oi_objno(oi);
1184 exofs_make_credential(oi->i_cred, &ios->obj);
1185
1186 ios->done = create_done; 1187 ios->done = create_done;
1187 ios->private = inode; 1188 ios->private = inode;
1188 ios->cred = oi->i_cred; 1189
1189 ret = exofs_sbi_create(ios); 1190 ret = ore_create(ios);
1190 if (ret) { 1191 if (ret) {
1191 exofs_put_io_state(ios); 1192 ore_put_io_state(ios);
1192 return ERR_PTR(ret); 1193 return ERR_PTR(ret);
1193 } 1194 }
1194 atomic_inc(&sbi->s_curr_pending); 1195 atomic_inc(&sbi->s_curr_pending);
@@ -1207,11 +1208,11 @@ struct updatei_args {
1207/* 1208/*
1208 * Callback function from exofs_update_inode(). 1209 * Callback function from exofs_update_inode().
1209 */ 1210 */
1210static void updatei_done(struct exofs_io_state *ios, void *p) 1211static void updatei_done(struct ore_io_state *ios, void *p)
1211{ 1212{
1212 struct updatei_args *args = p; 1213 struct updatei_args *args = p;
1213 1214
1214 exofs_put_io_state(ios); 1215 ore_put_io_state(ios);
1215 1216
1216 atomic_dec(&args->sbi->s_curr_pending); 1217 atomic_dec(&args->sbi->s_curr_pending);
1217 1218
@@ -1227,7 +1228,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1227 struct exofs_i_info *oi = exofs_i(inode); 1228 struct exofs_i_info *oi = exofs_i(inode);
1228 struct super_block *sb = inode->i_sb; 1229 struct super_block *sb = inode->i_sb;
1229 struct exofs_sb_info *sbi = sb->s_fs_info; 1230 struct exofs_sb_info *sbi = sb->s_fs_info;
1230 struct exofs_io_state *ios; 1231 struct ore_io_state *ios;
1231 struct osd_attr attr; 1232 struct osd_attr attr;
1232 struct exofs_fcb *fcb; 1233 struct exofs_fcb *fcb;
1233 struct updatei_args *args; 1234 struct updatei_args *args;
@@ -1266,9 +1267,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1266 } else 1267 } else
1267 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1268 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1268 1269
1269 ret = exofs_get_io_state(&sbi->layout, &ios); 1270 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1270 if (unlikely(ret)) { 1271 if (unlikely(ret)) {
1271 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 1272 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
1272 goto free_args; 1273 goto free_args;
1273 } 1274 }
1274 1275
@@ -1285,13 +1286,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1285 ios->private = args; 1286 ios->private = args;
1286 } 1287 }
1287 1288
1288 ret = exofs_oi_write(oi, ios); 1289 ret = ore_write(ios);
1289 if (!do_sync && !ret) { 1290 if (!do_sync && !ret) {
1290 atomic_inc(&sbi->s_curr_pending); 1291 atomic_inc(&sbi->s_curr_pending);
1291 goto out; /* deallocation in updatei_done */ 1292 goto out; /* deallocation in updatei_done */
1292 } 1293 }
1293 1294
1294 exofs_put_io_state(ios); 1295 ore_put_io_state(ios);
1295free_args: 1296free_args:
1296 kfree(args); 1297 kfree(args);
1297out: 1298out:
@@ -1310,11 +1311,11 @@ int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1310 * Callback function from exofs_delete_inode() - don't have much cleaning up to 1311 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1311 * do. 1312 * do.
1312 */ 1313 */
1313static void delete_done(struct exofs_io_state *ios, void *p) 1314static void delete_done(struct ore_io_state *ios, void *p)
1314{ 1315{
1315 struct exofs_sb_info *sbi = p; 1316 struct exofs_sb_info *sbi = p;
1316 1317
1317 exofs_put_io_state(ios); 1318 ore_put_io_state(ios);
1318 1319
1319 atomic_dec(&sbi->s_curr_pending); 1320 atomic_dec(&sbi->s_curr_pending);
1320} 1321}
@@ -1329,7 +1330,7 @@ void exofs_evict_inode(struct inode *inode)
1329 struct exofs_i_info *oi = exofs_i(inode); 1330 struct exofs_i_info *oi = exofs_i(inode);
1330 struct super_block *sb = inode->i_sb; 1331 struct super_block *sb = inode->i_sb;
1331 struct exofs_sb_info *sbi = sb->s_fs_info; 1332 struct exofs_sb_info *sbi = sb->s_fs_info;
1332 struct exofs_io_state *ios; 1333 struct ore_io_state *ios;
1333 int ret; 1334 int ret;
1334 1335
1335 truncate_inode_pages(&inode->i_data, 0); 1336 truncate_inode_pages(&inode->i_data, 0);
@@ -1349,20 +1350,19 @@ void exofs_evict_inode(struct inode *inode)
1349 /* ignore the error, attempt a remove anyway */ 1350 /* ignore the error, attempt a remove anyway */
1350 1351
1351 /* Now Remove the OSD objects */ 1352 /* Now Remove the OSD objects */
1352 ret = exofs_get_io_state(&sbi->layout, &ios); 1353 ret = ore_get_io_state(&sbi->layout, &oi->comps, &ios);
1353 if (unlikely(ret)) { 1354 if (unlikely(ret)) {
1354 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); 1355 EXOFS_ERR("%s: ore_get_io_state failed\n", __func__);
1355 return; 1356 return;
1356 } 1357 }
1357 1358
1358 ios->obj.id = exofs_oi_objno(oi);
1359 ios->done = delete_done; 1359 ios->done = delete_done;
1360 ios->private = sbi; 1360 ios->private = sbi;
1361 ios->cred = oi->i_cred; 1361
1362 ret = exofs_sbi_remove(ios); 1362 ret = ore_remove(ios);
1363 if (ret) { 1363 if (ret) {
1364 EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); 1364 EXOFS_ERR("%s: ore_remove failed\n", __func__);
1365 exofs_put_io_state(ios); 1365 ore_put_io_state(ios);
1366 return; 1366 return;
1367 } 1367 }
1368 atomic_inc(&sbi->s_curr_pending); 1368 atomic_inc(&sbi->s_curr_pending);
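Every converted call site in this file now follows the same ORE lifecycle: ore_get_io_state()/ore_get_rw_state() allocates per-device state for the inode's component array, ios->done and ios->private select asynchronous completion, one of ore_read()/ore_write()/ore_create()/ore_remove() submits, and ore_put_io_state() frees the state (from the done callback when async, inline on error or sync paths). A condensed sketch of the synchronous read case, assuming sbi, oi, offset, length and the page vector are set up as in the code above:

struct ore_io_state *ios;
int ret;

ret = ore_get_rw_state(&sbi->layout, &oi->comps, true /* read */,
		       offset, length, &ios);
if (unlikely(ret))
	return ret;

ios->pages = pages;		/* destination pages */
ios->nr_pages = nr_pages;
/* ios->done left NULL: ore_io_execute() runs synchronously */

ret = ore_read(ios);
ore_put_io_state(ios);
return ret;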
diff --git a/fs/exofs/ios.c b/fs/exofs/ore.c
index f74a2ec027a6..25305af88198 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ore.c
@@ -23,81 +23,87 @@
23 */ 23 */
24 24
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <scsi/scsi_device.h>
27#include <asm/div64.h> 26#include <asm/div64.h>
28 27
29#include "exofs.h" 28#include <scsi/osd_ore.h>
30 29
31#define EXOFS_DBGMSG2(M...) do {} while (0) 30#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a)
32/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */
33 31
34void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) 32#ifdef CONFIG_EXOFS_DEBUG
35{ 33#define ORE_DBGMSG(fmt, a...) \
36 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); 34 printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a)
37} 35#else
36#define ORE_DBGMSG(fmt, a...) \
37 do { if (0) printk(fmt, ##a); } while (0)
38#endif
38 39
39int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, 40/* u64 has problems with printk this will cast it to unsigned long long */
40 u64 offset, void *p, unsigned length) 41#define _LLU(x) (unsigned long long)(x)
41{
42 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
43/* struct osd_sense_info osi = {.key = 0};*/
44 int ret;
45 42
46 if (unlikely(!or)) { 43#define ORE_DBGMSG2(M...) do {} while (0)
47 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); 44/* #define ORE_DBGMSG2 ORE_DBGMSG */
48 return -ENOMEM;
49 }
50 ret = osd_req_read_kern(or, obj, offset, p, length);
51 if (unlikely(ret)) {
52 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
53 goto out;
54 }
55 45
56 ret = osd_finalize_request(or, 0, cred, NULL); 46MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>");
57 if (unlikely(ret)) { 47MODULE_DESCRIPTION("Objects Raid Engine ore.ko");
58 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); 48MODULE_LICENSE("GPL");
59 goto out;
60 }
61 49
62 ret = osd_execute_request(or); 50static u8 *_ios_cred(struct ore_io_state *ios, unsigned index)
63 if (unlikely(ret)) 51{
64 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); 52 return ios->comps->comps[index & ios->comps->single_comp].cred;
65 /* osd_req_decode_sense(or, ret); */ 53}
66 54
67out: 55static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index)
68 osd_end_request(or); 56{
69 return ret; 57 return &ios->comps->comps[index & ios->comps->single_comp].obj;
70} 58}
71 59
72int exofs_get_io_state(struct exofs_layout *layout, 60static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index)
73 struct exofs_io_state **pios)
74{ 61{
75 struct exofs_io_state *ios; 62 return ios->comps->ods[index];
63}
64
65int ore_get_rw_state(struct ore_layout *layout, struct ore_components *comps,
66 bool is_reading, u64 offset, u64 length,
67 struct ore_io_state **pios)
68{
69 struct ore_io_state *ios;
76 70
77 /*TODO: Maybe use kmem_cache per sbi of size 71 /*TODO: Maybe use kmem_cache per sbi of size
78 * exofs_io_state_size(layout->s_numdevs) 72 * exofs_io_state_size(layout->s_numdevs)
79 */ 73 */
80 ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); 74 ios = kzalloc(ore_io_state_size(comps->numdevs), GFP_KERNEL);
81 if (unlikely(!ios)) { 75 if (unlikely(!ios)) {
82 EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", 76 ORE_DBGMSG("Failed kzalloc bytes=%d\n",
83 exofs_io_state_size(layout->s_numdevs)); 77 ore_io_state_size(comps->numdevs));
84 *pios = NULL; 78 *pios = NULL;
85 return -ENOMEM; 79 return -ENOMEM;
86 } 80 }
87 81
88 ios->layout = layout; 82 ios->layout = layout;
89 ios->obj.partition = layout->s_pid; 83 ios->comps = comps;
84 ios->offset = offset;
85 ios->length = length;
86 ios->reading = is_reading;
87
90 *pios = ios; 88 *pios = ios;
91 return 0; 89 return 0;
92} 90}
91EXPORT_SYMBOL(ore_get_rw_state);
92
93int ore_get_io_state(struct ore_layout *layout, struct ore_components *comps,
94 struct ore_io_state **ios)
95{
96 return ore_get_rw_state(layout, comps, true, 0, 0, ios);
97}
98EXPORT_SYMBOL(ore_get_io_state);
93 99
94void exofs_put_io_state(struct exofs_io_state *ios) 100void ore_put_io_state(struct ore_io_state *ios)
95{ 101{
96 if (ios) { 102 if (ios) {
97 unsigned i; 103 unsigned i;
98 104
99 for (i = 0; i < ios->numdevs; i++) { 105 for (i = 0; i < ios->numdevs; i++) {
100 struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; 106 struct ore_per_dev_state *per_dev = &ios->per_dev[i];
101 107
102 if (per_dev->or) 108 if (per_dev->or)
103 osd_end_request(per_dev->or); 109 osd_end_request(per_dev->or);
@@ -108,31 +114,9 @@ void exofs_put_io_state(struct exofs_io_state *ios)
108 kfree(ios); 114 kfree(ios);
109 } 115 }
110} 116}
117EXPORT_SYMBOL(ore_put_io_state);
111 118
112unsigned exofs_layout_od_id(struct exofs_layout *layout, 119static void _sync_done(struct ore_io_state *ios, void *p)
113 osd_id obj_no, unsigned layout_index)
114{
115/* switch (layout->lay_func) {
116 case LAYOUT_MOVING_WINDOW:
117 {*/
118 unsigned dev_mod = obj_no;
119
120 return (layout_index + dev_mod * layout->mirrors_p1) %
121 layout->s_numdevs;
122/* }
123 case LAYOUT_FUNC_IMPLICT:
124 return layout->devs[layout_index];
125 }*/
126}
127
128static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios,
129 unsigned layout_index)
130{
131 return ios->layout->s_ods[
132 exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)];
133}
134
135static void _sync_done(struct exofs_io_state *ios, void *p)
136{ 120{
137 struct completion *waiting = p; 121 struct completion *waiting = p;
138 122
@@ -141,20 +125,20 @@ static void _sync_done(struct exofs_io_state *ios, void *p)
141 125
142static void _last_io(struct kref *kref) 126static void _last_io(struct kref *kref)
143{ 127{
144 struct exofs_io_state *ios = container_of( 128 struct ore_io_state *ios = container_of(
145 kref, struct exofs_io_state, kref); 129 kref, struct ore_io_state, kref);
146 130
147 ios->done(ios, ios->private); 131 ios->done(ios, ios->private);
148} 132}
149 133
150static void _done_io(struct osd_request *or, void *p) 134static void _done_io(struct osd_request *or, void *p)
151{ 135{
152 struct exofs_io_state *ios = p; 136 struct ore_io_state *ios = p;
153 137
154 kref_put(&ios->kref, _last_io); 138 kref_put(&ios->kref, _last_io);
155} 139}
156 140
157static int exofs_io_execute(struct exofs_io_state *ios) 141static int ore_io_execute(struct ore_io_state *ios)
158{ 142{
159 DECLARE_COMPLETION_ONSTACK(wait); 143 DECLARE_COMPLETION_ONSTACK(wait);
160 bool sync = (ios->done == NULL); 144 bool sync = (ios->done == NULL);
@@ -170,9 +154,9 @@ static int exofs_io_execute(struct exofs_io_state *ios)
170 if (unlikely(!or)) 154 if (unlikely(!or))
171 continue; 155 continue;
172 156
173 ret = osd_finalize_request(or, 0, ios->cred, NULL); 157 ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL);
174 if (unlikely(ret)) { 158 if (unlikely(ret)) {
175 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", 159 ORE_DBGMSG("Failed to osd_finalize_request() => %d\n",
176 ret); 160 ret);
177 return ret; 161 return ret;
178 } 162 }
@@ -194,7 +178,7 @@ static int exofs_io_execute(struct exofs_io_state *ios)
194 178
195 if (sync) { 179 if (sync) {
196 wait_for_completion(&wait); 180 wait_for_completion(&wait);
197 ret = exofs_check_io(ios, NULL); 181 ret = ore_check_io(ios, NULL);
198 } 182 }
199 return ret; 183 return ret;
200} 184}
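ore_io_execute() (renamed above from exofs_io_execute()) serves both modes with one mechanism: each submitted osd_request holds a kref on the io_state, _done_io() drops it, and _last_io() fires ios->done exactly once when the last device completes; synchronous callers get _sync_done(), which completes the on-stack completion that ore_io_execute() then waits on. The fan-in pattern in isolation, with illustrative names:

struct io_ctx {
	struct kref kref;
	void (*done)(struct io_ctx *ctx, void *private);
	void *private;
};

static void last_done(struct kref *kref)
{
	struct io_ctx *ctx = container_of(kref, struct io_ctx, kref);

	/* runs once, after the final sub-request completes */
	ctx->done(ctx, ctx->private);
}

static void one_done(void *p)	/* per-device completion callback */
{
	struct io_ctx *ctx = p;

	kref_put(&ctx->kref, last_done);
}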
@@ -214,7 +198,7 @@ static void _clear_bio(struct bio *bio)
214 } 198 }
215} 199}
216 200
217int exofs_check_io(struct exofs_io_state *ios, u64 *resid) 201int ore_check_io(struct ore_io_state *ios, u64 *resid)
218{ 202{
219 enum osd_err_priority acumulated_osd_err = 0; 203 enum osd_err_priority acumulated_osd_err = 0;
220 int acumulated_lin_err = 0; 204 int acumulated_lin_err = 0;
@@ -235,7 +219,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
235 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { 219 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
236 /* start read offset passed end of file */ 220 /* start read offset passed end of file */
237 _clear_bio(ios->per_dev[i].bio); 221 _clear_bio(ios->per_dev[i].bio);
238 EXOFS_DBGMSG("start read offset passed end of file " 222 ORE_DBGMSG("start read offset passed end of file "
239 "offset=0x%llx, length=0x%llx\n", 223 "offset=0x%llx, length=0x%llx\n",
240 _LLU(ios->per_dev[i].offset), 224 _LLU(ios->per_dev[i].offset),
241 _LLU(ios->per_dev[i].length)); 225 _LLU(ios->per_dev[i].length));
@@ -259,6 +243,7 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
259 243
260 return acumulated_lin_err; 244 return acumulated_lin_err;
261} 245}
246EXPORT_SYMBOL(ore_check_io);
262 247
263/* 248/*
264 * L - logical offset into the file 249 * L - logical offset into the file
@@ -305,20 +290,21 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
305struct _striping_info { 290struct _striping_info {
306 u64 obj_offset; 291 u64 obj_offset;
307 u64 group_length; 292 u64 group_length;
293 u64 M; /* for truncate */
308 unsigned dev; 294 unsigned dev;
309 unsigned unit_off; 295 unsigned unit_off;
310}; 296};
311 297
312static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, 298static void _calc_stripe_info(struct ore_layout *layout, u64 file_offset,
313 struct _striping_info *si) 299 struct _striping_info *si)
314{ 300{
315 u32 stripe_unit = ios->layout->stripe_unit; 301 u32 stripe_unit = layout->stripe_unit;
316 u32 group_width = ios->layout->group_width; 302 u32 group_width = layout->group_width;
317 u64 group_depth = ios->layout->group_depth; 303 u64 group_depth = layout->group_depth;
318 304
319 u32 U = stripe_unit * group_width; 305 u32 U = stripe_unit * group_width;
320 u64 T = U * group_depth; 306 u64 T = U * group_depth;
321 u64 S = T * ios->layout->group_count; 307 u64 S = T * layout->group_count;
322 u64 M = div64_u64(file_offset, S); 308 u64 M = div64_u64(file_offset, S);
323 309
324 /* 310 /*
@@ -333,7 +319,7 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
333 319
334 /* "H - (N * U)" is just "H % U" so it's bound to u32 */ 320 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
335 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; 321 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
336 si->dev *= ios->layout->mirrors_p1; 322 si->dev *= layout->mirrors_p1;
337 323
338 div_u64_rem(file_offset, stripe_unit, &si->unit_off); 324 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
339 325
@@ -341,15 +327,16 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
341 (M * group_depth * stripe_unit); 327 (M * group_depth * stripe_unit);
342 328
343 si->group_length = T - H; 329 si->group_length = T - H;
330 si->M = M;
344} 331}
345 332
346static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, 333static int _add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
347 unsigned pgbase, struct exofs_per_dev_state *per_dev, 334 unsigned pgbase, struct ore_per_dev_state *per_dev,
348 int cur_len) 335 int cur_len)
349{ 336{
350 unsigned pg = *cur_pg; 337 unsigned pg = *cur_pg;
351 struct request_queue *q = 338 struct request_queue *q =
352 osd_request_queue(exofs_ios_od(ios, per_dev->dev)); 339 osd_request_queue(_ios_od(ios, per_dev->dev));
353 340
354 per_dev->length += cur_len; 341 per_dev->length += cur_len;
355 342
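_calc_stripe_info() is unchanged here beyond taking the layout directly and recording M for truncate. From the fragments above: U = stripe_unit * group_width, T = U * group_depth, S = T * group_count, M = offset / S, unit_off = offset % stripe_unit, and group_length = T - H. A runnable numeric walk-through (the intermediate G, H, N steps are reconstructed from the visible arithmetic and should be read as a sketch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t stripe_unit = 64 << 10, group_width = 4;
	uint64_t group_depth = 2, group_count = 3, mirrors_p1 = 1;
	uint64_t offset = 700ULL << 10;		/* 700K into the file */

	uint64_t U = stripe_unit * group_width;	/* 256K per stripe */
	uint64_t T = U * group_depth;		/* 512K per group */
	uint64_t S = T * group_count;		/* 1.5M per full cycle */
	uint64_t M = offset / S;		/* completed cycles */
	uint64_t G = (offset - M * S) / T;	/* group within cycle */
	uint64_t H = offset - M * S - G * T;	/* offset within group */
	uint64_t N = H / U;			/* stripe within group */
	uint64_t dev = ((H - N * U) / stripe_unit + G * group_width)
			* mirrors_p1;		/* 6: third dev of group 1 */
	uint64_t unit_off = offset % stripe_unit;	/* 60K */
	uint64_t obj_off = unit_off + N * stripe_unit
			   + M * group_depth * stripe_unit;

	printf("dev=%llu obj_offset=%llu group_length=%llu\n",
	       (unsigned long long)dev,
	       (unsigned long long)obj_off,
	       (unsigned long long)(T - H));
	return 0;
}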
@@ -361,7 +348,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
361 348
362 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); 349 per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
363 if (unlikely(!per_dev->bio)) { 350 if (unlikely(!per_dev->bio)) {
364 EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", 351 ORE_DBGMSG("Failed to allocate BIO size=%u\n",
365 bio_size); 352 bio_size);
366 return -ENOMEM; 353 return -ENOMEM;
367 } 354 }
@@ -387,7 +374,7 @@ static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg,
387 return 0; 374 return 0;
388} 375}
389 376
390static int _prepare_one_group(struct exofs_io_state *ios, u64 length, 377static int _prepare_one_group(struct ore_io_state *ios, u64 length,
391 struct _striping_info *si) 378 struct _striping_info *si)
392{ 379{
393 unsigned stripe_unit = ios->layout->stripe_unit; 380 unsigned stripe_unit = ios->layout->stripe_unit;
@@ -400,7 +387,7 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
400 int ret = 0; 387 int ret = 0;
401 388
402 while (length) { 389 while (length) {
403 struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; 390 struct ore_per_dev_state *per_dev = &ios->per_dev[dev];
404 unsigned cur_len, page_off = 0; 391 unsigned cur_len, page_off = 0;
405 392
406 if (!per_dev->length) { 393 if (!per_dev->length) {
@@ -443,7 +430,7 @@ out:
443 return ret; 430 return ret;
444} 431}
445 432
446static int _prepare_for_striping(struct exofs_io_state *ios) 433static int _prepare_for_striping(struct ore_io_state *ios)
447{ 434{
448 u64 length = ios->length; 435 u64 length = ios->length;
449 u64 offset = ios->offset; 436 u64 offset = ios->offset;
@@ -452,9 +439,9 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
452 439
453 if (!ios->pages) { 440 if (!ios->pages) {
454 if (ios->kern_buff) { 441 if (ios->kern_buff) {
455 struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; 442 struct ore_per_dev_state *per_dev = &ios->per_dev[0];
456 443
457 _calc_stripe_info(ios, ios->offset, &si); 444 _calc_stripe_info(ios->layout, ios->offset, &si);
458 per_dev->offset = si.obj_offset; 445 per_dev->offset = si.obj_offset;
459 per_dev->dev = si.dev; 446 per_dev->dev = si.dev;
460 447
@@ -468,7 +455,7 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
468 } 455 }
469 456
470 while (length) { 457 while (length) {
471 _calc_stripe_info(ios, offset, &si); 458 _calc_stripe_info(ios->layout, offset, &si);
472 459
473 if (length < si.group_length) 460 if (length < si.group_length)
474 si.group_length = length; 461 si.group_length = length;
@@ -485,57 +472,59 @@ out:
485 return ret; 472 return ret;
486} 473}
487 474
488int exofs_sbi_create(struct exofs_io_state *ios) 475int ore_create(struct ore_io_state *ios)
489{ 476{
490 int i, ret; 477 int i, ret;
491 478
492 for (i = 0; i < ios->layout->s_numdevs; i++) { 479 for (i = 0; i < ios->comps->numdevs; i++) {
493 struct osd_request *or; 480 struct osd_request *or;
494 481
495 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 482 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
496 if (unlikely(!or)) { 483 if (unlikely(!or)) {
497 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 484 ORE_ERR("%s: osd_start_request failed\n", __func__);
498 ret = -ENOMEM; 485 ret = -ENOMEM;
499 goto out; 486 goto out;
500 } 487 }
501 ios->per_dev[i].or = or; 488 ios->per_dev[i].or = or;
502 ios->numdevs++; 489 ios->numdevs++;
503 490
504 osd_req_create_object(or, &ios->obj); 491 osd_req_create_object(or, _ios_obj(ios, i));
505 } 492 }
506 ret = exofs_io_execute(ios); 493 ret = ore_io_execute(ios);
507 494
508out: 495out:
509 return ret; 496 return ret;
510} 497}
498EXPORT_SYMBOL(ore_create);
511 499
512int exofs_sbi_remove(struct exofs_io_state *ios) 500int ore_remove(struct ore_io_state *ios)
513{ 501{
514 int i, ret; 502 int i, ret;
515 503
516 for (i = 0; i < ios->layout->s_numdevs; i++) { 504 for (i = 0; i < ios->comps->numdevs; i++) {
517 struct osd_request *or; 505 struct osd_request *or;
518 506
519 or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); 507 or = osd_start_request(_ios_od(ios, i), GFP_KERNEL);
520 if (unlikely(!or)) { 508 if (unlikely(!or)) {
521 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 509 ORE_ERR("%s: osd_start_request failed\n", __func__);
522 ret = -ENOMEM; 510 ret = -ENOMEM;
523 goto out; 511 goto out;
524 } 512 }
525 ios->per_dev[i].or = or; 513 ios->per_dev[i].or = or;
526 ios->numdevs++; 514 ios->numdevs++;
527 515
528 osd_req_remove_object(or, &ios->obj); 516 osd_req_remove_object(or, _ios_obj(ios, i));
529 } 517 }
530 ret = exofs_io_execute(ios); 518 ret = ore_io_execute(ios);
531 519
532out: 520out:
533 return ret; 521 return ret;
534} 522}
523EXPORT_SYMBOL(ore_remove);
535 524
536static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) 525static int _write_mirror(struct ore_io_state *ios, int cur_comp)
537{ 526{
538 struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; 527 struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp];
539 unsigned dev = ios->per_dev[cur_comp].dev; 528 unsigned dev = ios->per_dev[cur_comp].dev;
540 unsigned last_comp = cur_comp + ios->layout->mirrors_p1; 529 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
541 int ret = 0; 530 int ret = 0;
@@ -544,12 +533,12 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
544 return 0; /* Just an empty slot */ 533 return 0; /* Just an empty slot */
545 534
546 for (; cur_comp < last_comp; ++cur_comp, ++dev) { 535 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
547 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 536 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
548 struct osd_request *or; 537 struct osd_request *or;
549 538
550 or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); 539 or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL);
551 if (unlikely(!or)) { 540 if (unlikely(!or)) {
552 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 541 ORE_ERR("%s: osd_start_request failed\n", __func__);
553 ret = -ENOMEM; 542 ret = -ENOMEM;
554 goto out; 543 goto out;
555 } 544 }
@@ -563,7 +552,7 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
563 bio = bio_kmalloc(GFP_KERNEL, 552 bio = bio_kmalloc(GFP_KERNEL,
564 master_dev->bio->bi_max_vecs); 553 master_dev->bio->bi_max_vecs);
565 if (unlikely(!bio)) { 554 if (unlikely(!bio)) {
566 EXOFS_DBGMSG( 555 ORE_DBGMSG(
567 "Failed to allocate BIO size=%u\n", 556 "Failed to allocate BIO size=%u\n",
568 master_dev->bio->bi_max_vecs); 557 master_dev->bio->bi_max_vecs);
569 ret = -ENOMEM; 558 ret = -ENOMEM;
@@ -582,25 +571,29 @@ static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp)
582 bio->bi_rw |= REQ_WRITE; 571 bio->bi_rw |= REQ_WRITE;
583 } 572 }
584 573
585 osd_req_write(or, &ios->obj, per_dev->offset, bio, 574 osd_req_write(or, _ios_obj(ios, dev), per_dev->offset,
586 per_dev->length); 575 bio, per_dev->length);
587 EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " 576 ORE_DBGMSG("write(0x%llx) offset=0x%llx "
588 "length=0x%llx dev=%d\n", 577 "length=0x%llx dev=%d\n",
589 _LLU(ios->obj.id), _LLU(per_dev->offset), 578 _LLU(_ios_obj(ios, dev)->id),
579 _LLU(per_dev->offset),
590 _LLU(per_dev->length), dev); 580 _LLU(per_dev->length), dev);
591 } else if (ios->kern_buff) { 581 } else if (ios->kern_buff) {
592 ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, 582 ret = osd_req_write_kern(or, _ios_obj(ios, dev),
593 ios->kern_buff, ios->length); 583 per_dev->offset,
584 ios->kern_buff, ios->length);
594 if (unlikely(ret)) 585 if (unlikely(ret))
595 goto out; 586 goto out;
596 EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " 587 ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx "
597 "length=0x%llx dev=%d\n", 588 "length=0x%llx dev=%d\n",
598 _LLU(ios->obj.id), _LLU(per_dev->offset), 589 _LLU(_ios_obj(ios, dev)->id),
590 _LLU(per_dev->offset),
599 _LLU(ios->length), dev); 591 _LLU(ios->length), dev);
600 } else { 592 } else {
601 osd_req_set_attributes(or, &ios->obj); 593 osd_req_set_attributes(or, _ios_obj(ios, dev));
602 EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", 594 ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n",
603 _LLU(ios->obj.id), ios->out_attr_len, dev); 595 _LLU(_ios_obj(ios, dev)->id),
596 ios->out_attr_len, dev);
604 } 597 }
605 598
606 if (ios->out_attr) 599 if (ios->out_attr)
@@ -616,7 +609,7 @@ out:
616 return ret; 609 return ret;
617} 610}
618 611
619int exofs_sbi_write(struct exofs_io_state *ios) 612int ore_write(struct ore_io_state *ios)
620{ 613{
621 int i; 614 int i;
622 int ret; 615 int ret;
@@ -626,52 +619,55 @@ int exofs_sbi_write(struct exofs_io_state *ios)
626 return ret; 619 return ret;
627 620
628 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 621 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
629 ret = _sbi_write_mirror(ios, i); 622 ret = _write_mirror(ios, i);
630 if (unlikely(ret)) 623 if (unlikely(ret))
631 return ret; 624 return ret;
632 } 625 }
633 626
634 ret = exofs_io_execute(ios); 627 ret = ore_io_execute(ios);
635 return ret; 628 return ret;
636} 629}
630EXPORT_SYMBOL(ore_write);
637 631
638static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) 632static int _read_mirror(struct ore_io_state *ios, unsigned cur_comp)
639{ 633{
640 struct osd_request *or; 634 struct osd_request *or;
641 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 635 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
642 unsigned first_dev = (unsigned)ios->obj.id; 636 struct osd_obj_id *obj = _ios_obj(ios, cur_comp);
637 unsigned first_dev = (unsigned)obj->id;
643 638
644 if (ios->pages && !per_dev->length) 639 if (ios->pages && !per_dev->length)
645 return 0; /* Just an empty slot */ 640 return 0; /* Just an empty slot */
646 641
647 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; 642 first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1;
648 or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); 643 or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL);
649 if (unlikely(!or)) { 644 if (unlikely(!or)) {
650 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 645 ORE_ERR("%s: osd_start_request failed\n", __func__);
651 return -ENOMEM; 646 return -ENOMEM;
652 } 647 }
653 per_dev->or = or; 648 per_dev->or = or;
654 649
655 if (ios->pages) { 650 if (ios->pages) {
656 osd_req_read(or, &ios->obj, per_dev->offset, 651 osd_req_read(or, obj, per_dev->offset,
657 per_dev->bio, per_dev->length); 652 per_dev->bio, per_dev->length);
658 EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" 653 ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx"
659 " dev=%d\n", _LLU(ios->obj.id), 654 " dev=%d\n", _LLU(obj->id),
660 _LLU(per_dev->offset), _LLU(per_dev->length), 655 _LLU(per_dev->offset), _LLU(per_dev->length),
661 first_dev); 656 first_dev);
662 } else if (ios->kern_buff) { 657 } else if (ios->kern_buff) {
663 int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, 658 int ret = osd_req_read_kern(or, obj, per_dev->offset,
664 ios->kern_buff, ios->length); 659 ios->kern_buff, ios->length);
665 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " 660 ORE_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
666 "length=0x%llx dev=%d ret=>%d\n", 661 "length=0x%llx dev=%d ret=>%d\n",
667 _LLU(ios->obj.id), _LLU(per_dev->offset), 662 _LLU(obj->id), _LLU(per_dev->offset),
668 _LLU(ios->length), first_dev, ret); 663 _LLU(ios->length), first_dev, ret);
669 if (unlikely(ret)) 664 if (unlikely(ret))
670 return ret; 665 return ret;
671 } else { 666 } else {
672 osd_req_get_attributes(or, &ios->obj); 667 osd_req_get_attributes(or, obj);
673 EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", 668 ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n",
674 _LLU(ios->obj.id), ios->in_attr_len, first_dev); 669 _LLU(obj->id),
670 ios->in_attr_len, first_dev);
675 } 671 }
676 if (ios->out_attr) 672 if (ios->out_attr)
677 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); 673 osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len);
@@ -682,7 +678,7 @@ static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp)
682 return 0; 678 return 0;
683} 679}
684 680
685int exofs_sbi_read(struct exofs_io_state *ios) 681int ore_read(struct ore_io_state *ios)
686{ 682{
687 int i; 683 int i;
688 int ret; 684 int ret;
@@ -692,16 +688,17 @@ int exofs_sbi_read(struct exofs_io_state *ios)
692 return ret; 688 return ret;
693 689
694 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { 690 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
695 ret = _sbi_read_mirror(ios, i); 691 ret = _read_mirror(ios, i);
696 if (unlikely(ret)) 692 if (unlikely(ret))
697 return ret; 693 return ret;
698 } 694 }
699 695
700 ret = exofs_io_execute(ios); 696 ret = ore_io_execute(ios);
701 return ret; 697 return ret;
702} 698}
699EXPORT_SYMBOL(ore_read);
703 700
704int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) 701int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr)
705{ 702{
706 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ 703 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
707 void *iter = NULL; 704 void *iter = NULL;
@@ -721,83 +718,118 @@ int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
721 718
722 return -EIO; 719 return -EIO;
723} 720}
721EXPORT_SYMBOL(extract_attr_from_ios);
724 722
725static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, 723static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp,
726 struct osd_attr *attr) 724 struct osd_attr *attr)
727{ 725{
728 int last_comp = cur_comp + ios->layout->mirrors_p1; 726 int last_comp = cur_comp + ios->layout->mirrors_p1;
729 727
730 for (; cur_comp < last_comp; ++cur_comp) { 728 for (; cur_comp < last_comp; ++cur_comp) {
731 struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; 729 struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp];
732 struct osd_request *or; 730 struct osd_request *or;
733 731
734 or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); 732 or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL);
735 if (unlikely(!or)) { 733 if (unlikely(!or)) {
736 EXOFS_ERR("%s: osd_start_request failed\n", __func__); 734 ORE_ERR("%s: osd_start_request failed\n", __func__);
737 return -ENOMEM; 735 return -ENOMEM;
738 } 736 }
739 per_dev->or = or; 737 per_dev->or = or;
740 738
741 osd_req_set_attributes(or, &ios->obj); 739 osd_req_set_attributes(or, _ios_obj(ios, cur_comp));
742 osd_req_add_set_attr_list(or, attr, 1); 740 osd_req_add_set_attr_list(or, attr, 1);
743 } 741 }
744 742
745 return 0; 743 return 0;
746} 744}
747 745
748int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) 746struct _trunc_info {
747 struct _striping_info si;
748 u64 prev_group_obj_off;
749 u64 next_group_obj_off;
750
751 unsigned first_group_dev;
752 unsigned nex_group_dev;
753 unsigned max_devs;
754};
755
756void _calc_trunk_info(struct ore_layout *layout, u64 file_offset,
757 struct _trunc_info *ti)
758{
759 unsigned stripe_unit = layout->stripe_unit;
760
761 _calc_stripe_info(layout, file_offset, &ti->si);
762
763 ti->prev_group_obj_off = ti->si.M * stripe_unit;
764 ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0;
765
766 ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width);
767 ti->nex_group_dev = ti->first_group_dev + layout->group_width;
768 ti->max_devs = layout->group_width * layout->group_count;
769}
770
771int ore_truncate(struct ore_layout *layout, struct ore_components *comps,
772 u64 size)
749{ 773{
750 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; 774 struct ore_io_state *ios;
751 struct exofs_io_state *ios;
752 struct exofs_trunc_attr { 775 struct exofs_trunc_attr {
753 struct osd_attr attr; 776 struct osd_attr attr;
754 __be64 newsize; 777 __be64 newsize;
755 } *size_attrs; 778 } *size_attrs;
756 struct _striping_info si; 779 struct _trunc_info ti;
757 int i, ret; 780 int i, ret;
758 781
759 ret = exofs_get_io_state(&sbi->layout, &ios); 782 ret = ore_get_io_state(layout, comps, &ios);
760 if (unlikely(ret)) 783 if (unlikely(ret))
761 return ret; 784 return ret;
762 785
763 size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), 786 _calc_trunk_info(ios->layout, size, &ti);
787
788 size_attrs = kcalloc(ti.max_devs, sizeof(*size_attrs),
764 GFP_KERNEL); 789 GFP_KERNEL);
765 if (unlikely(!size_attrs)) { 790 if (unlikely(!size_attrs)) {
766 ret = -ENOMEM; 791 ret = -ENOMEM;
767 goto out; 792 goto out;
768 } 793 }
769 794
770 ios->obj.id = exofs_oi_objno(oi); 795 ios->numdevs = ios->comps->numdevs;
771 ios->cred = oi->i_cred;
772 796
773 ios->numdevs = ios->layout->s_numdevs; 797 for (i = 0; i < ti.max_devs; ++i) {
774 _calc_stripe_info(ios, size, &si);
775
776 for (i = 0; i < ios->layout->group_width; ++i) {
777 struct exofs_trunc_attr *size_attr = &size_attrs[i]; 798 struct exofs_trunc_attr *size_attr = &size_attrs[i];
778 u64 obj_size; 799 u64 obj_size;
779 800
780 if (i < si.dev) 801 if (i < ti.first_group_dev)
781 obj_size = si.obj_offset + 802 obj_size = ti.prev_group_obj_off;
782 ios->layout->stripe_unit - si.unit_off; 803 else if (i >= ti.nex_group_dev)
783 else if (i == si.dev) 804 obj_size = ti.next_group_obj_off;
784 obj_size = si.obj_offset; 805 else if (i < ti.si.dev) /* dev within this group */
785 else /* i > si.dev */ 806 obj_size = ti.si.obj_offset +
786 obj_size = si.obj_offset - si.unit_off; 807 ios->layout->stripe_unit - ti.si.unit_off;
808 else if (i == ti.si.dev)
809 obj_size = ti.si.obj_offset;
810 else /* i > ti.si.dev */
811 obj_size = ti.si.obj_offset - ti.si.unit_off;
787 812
788 size_attr->newsize = cpu_to_be64(obj_size); 813 size_attr->newsize = cpu_to_be64(obj_size);
789 size_attr->attr = g_attr_logical_length; 814 size_attr->attr = g_attr_logical_length;
790 size_attr->attr.val_ptr = &size_attr->newsize; 815 size_attr->attr.val_ptr = &size_attr->newsize;
791 816
817 ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
818 _LLU(comps->comps->obj.id), _LLU(obj_size), i);
792 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, 819 ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
793 &size_attr->attr); 820 &size_attr->attr);
794 if (unlikely(ret)) 821 if (unlikely(ret))
795 goto out; 822 goto out;
796 } 823 }
797 ret = exofs_io_execute(ios); 824 ret = ore_io_execute(ios);
798 825
799out: 826out:
800 kfree(size_attrs); 827 kfree(size_attrs);
801 exofs_put_io_state(ios); 828 ore_put_io_state(ios);
802 return ret; 829 return ret;
803} 830}
831EXPORT_SYMBOL(ore_truncate);
832
833const struct osd_attr g_attr_logical_length = ATTR_DEF(
834 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
835EXPORT_SYMBOL(g_attr_logical_length);
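The ore_truncate() hunk above sets each component object's logical length according to where the new EOF lands in the stripe. A minimal user-space sketch of the same per-device arithmetic, assuming the simplified single-group, no-mirror case (function and parameter names here are illustrative, not kernel API):

#include <stdint.h>

/* Illustrative sketch: logical length of each component object after
 * truncating a RAID0-striped file to `size`, for one group and
 * mirrors_p1 == 1. Mirrors the i < dev / i == dev / i > dev cases in
 * the ore_truncate() loop above.
 */
static void trunc_obj_sizes(uint64_t size, unsigned stripe_unit,
                            unsigned group_width, uint64_t *obj_size)
{
        uint64_t stripe_no = size / stripe_unit;
        unsigned unit_off = size % stripe_unit;
        unsigned dev = stripe_no % group_width;  /* device holding EOF */
        uint64_t obj_offset = (stripe_no / group_width) * stripe_unit
                            + unit_off;          /* EOF offset inside dev */
        unsigned i;

        for (i = 0; i < group_width; i++) {
                if (i < dev)        /* keeps a full unit in the EOF stripe */
                        obj_size[i] = obj_offset + stripe_unit - unit_off;
                else if (i == dev)  /* keeps a partial unit up to EOF */
                        obj_size[i] = obj_offset;
                else                /* i > dev: ends at the previous stripe */
                        obj_size[i] = obj_offset - unit_off;
        }
}

For example, with stripe_unit = 64 KiB, group_width = 3 and size = 200 KiB: stripe_no = 3, unit_off = 8 KiB, dev = 0, obj_offset = 72 KiB, so the devices keep 72 + 64 + 64 = 200 KiB in total, as expected.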
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
deleted file mode 100644
index c52e9888b8ab..000000000000
--- a/fs/exofs/pnfs.h
+++ /dev/null
@@ -1,45 +0,0 @@
1/*
2 * Copyright (C) 2008, 2009
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of exofs.
6 *
7 * exofs is free software; you can redistribute it and/or modify it under the
8 * terms of the GNU General Public License version 2 as published by the Free
9 * Software Foundation.
10 *
11 */
12
13/* FIXME: Remove this file once pnfs hits mainline */
14
15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__
17
18#if ! defined(__PNFS_OSD_XDR_H__)
19
20enum pnfs_iomode {
21 IOMODE_READ = 1,
22 IOMODE_RW = 2,
23 IOMODE_ANY = 3,
24};
25
26/* Layout Structure */
27enum pnfs_osd_raid_algorithm4 {
28 PNFS_OSD_RAID_0 = 1,
29 PNFS_OSD_RAID_4 = 2,
30 PNFS_OSD_RAID_5 = 3,
31 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
32};
33
34struct pnfs_osd_data_map {
35 u32 odm_num_comps;
36 u64 odm_stripe_unit;
37 u32 odm_group_width;
38 u32 odm_group_depth;
39 u32 odm_mirror_cnt;
40 u32 odm_raid_algorithm;
41};
42
43#endif /* ! defined(__PNFS_OSD_XDR_H__) */
44
45#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index c57beddcc217..274894053b02 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -40,6 +40,8 @@
40 40
41#include "exofs.h" 41#include "exofs.h"
42 42
43#define EXOFS_DBGMSG2(M...) do {} while (0)
44
43/****************************************************************************** 45/******************************************************************************
44 * MOUNT OPTIONS 46 * MOUNT OPTIONS
45 *****************************************************************************/ 47 *****************************************************************************/
@@ -208,10 +210,48 @@ static void destroy_inodecache(void)
208} 210}
209 211
210/****************************************************************************** 212/******************************************************************************
211 * SUPERBLOCK FUNCTIONS 213 * Some osd helpers
212 *****************************************************************************/ 214 *****************************************************************************/
213static const struct super_operations exofs_sops; 215void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
214static const struct export_operations exofs_export_ops; 216{
217 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
218}
219
220static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
221 u64 offset, void *p, unsigned length)
222{
223 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
224/* struct osd_sense_info osi = {.key = 0};*/
225 int ret;
226
227 if (unlikely(!or)) {
228 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
229 return -ENOMEM;
230 }
231 ret = osd_req_read_kern(or, obj, offset, p, length);
232 if (unlikely(ret)) {
233 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
234 goto out;
235 }
236
237 ret = osd_finalize_request(or, 0, cred, NULL);
238 if (unlikely(ret)) {
239 EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret);
240 goto out;
241 }
242
243 ret = osd_execute_request(or);
244 if (unlikely(ret))
245 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
246 /* osd_req_decode_sense(or, ret); */
247
248out:
249 osd_end_request(or);
250 EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx "
251 "length=0x%llx dev=%p ret=>%d\n",
252 _LLU(obj->id), _LLU(offset), _LLU(length), od, ret);
253 return ret;
254}
215 255
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF( 256static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA, 257 EXOFS_APAGE_SB_DATA,
@@ -223,21 +263,19 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
223 struct osd_attr attrs[] = { 263 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats, 264 [0] = g_attr_sb_stats,
225 }; 265 };
226 struct exofs_io_state *ios; 266 struct ore_io_state *ios;
227 int ret; 267 int ret;
228 268
229 ret = exofs_get_io_state(&sbi->layout, &ios); 269 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
230 if (unlikely(ret)) { 270 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 271 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
232 return ret; 272 return ret;
233 } 273 }
234 274
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs; 275 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs); 276 ios->in_attr_len = ARRAY_SIZE(attrs);
239 277
240 ret = exofs_sbi_read(ios); 278 ret = ore_read(ios);
241 if (unlikely(ret)) { 279 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret); 280 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out; 281 goto out;
@@ -264,13 +302,13 @@ static int __sbi_read_stats(struct exofs_sb_info *sbi)
264 } 302 }
265 303
266out: 304out:
267 exofs_put_io_state(ios); 305 ore_put_io_state(ios);
268 return ret; 306 return ret;
269} 307}
270 308
271static void stats_done(struct exofs_io_state *ios, void *p) 309static void stats_done(struct ore_io_state *ios, void *p)
272{ 310{
273 exofs_put_io_state(ios); 311 ore_put_io_state(ios);
274 /* Good thanks nothing to do anymore */ 312 /* Good thanks nothing to do anymore */
275} 313}
276 314
@@ -280,12 +318,12 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
280 struct osd_attr attrs[] = { 318 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats, 319 [0] = g_attr_sb_stats,
282 }; 320 };
283 struct exofs_io_state *ios; 321 struct ore_io_state *ios;
284 int ret; 322 int ret;
285 323
286 ret = exofs_get_io_state(&sbi->layout, &ios); 324 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
287 if (unlikely(ret)) { 325 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); 326 EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__);
289 return ret; 327 return ret;
290 } 328 }
291 329
@@ -293,21 +331,27 @@ int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); 331 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess; 332 attrs[0].val_ptr = &sbi->s_ess;
295 333
296 ios->cred = sbi->s_cred; 334
297 ios->done = stats_done; 335 ios->done = stats_done;
298 ios->private = sbi; 336 ios->private = sbi;
299 ios->out_attr = attrs; 337 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs); 338 ios->out_attr_len = ARRAY_SIZE(attrs);
301 339
302 ret = exofs_sbi_write(ios); 340 ret = ore_write(ios);
303 if (unlikely(ret)) { 341 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 342 EXOFS_ERR("%s: ore_write failed.\n", __func__);
305 exofs_put_io_state(ios); 343 ore_put_io_state(ios);
306 } 344 }
307 345
308 return ret; 346 return ret;
309} 347}
310 348
349/******************************************************************************
350 * SUPERBLOCK FUNCTIONS
351 *****************************************************************************/
352static const struct super_operations exofs_sops;
353static const struct export_operations exofs_export_ops;
354
311/* 355/*
312 * Write the superblock to the OSD 356 * Write the superblock to the OSD
313 */ 357 */
@@ -315,7 +359,9 @@ int exofs_sync_fs(struct super_block *sb, int wait)
315{ 359{
316 struct exofs_sb_info *sbi; 360 struct exofs_sb_info *sbi;
317 struct exofs_fscb *fscb; 361 struct exofs_fscb *fscb;
318 struct exofs_io_state *ios; 362 struct ore_comp one_comp;
363 struct ore_components comps;
364 struct ore_io_state *ios;
319 int ret = -ENOMEM; 365 int ret = -ENOMEM;
320 366
321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); 367 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
@@ -331,7 +377,10 @@ int exofs_sync_fs(struct super_block *sb, int wait)
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All 377 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above. 378 * the writeable info is set in exofs_sbi_write_stats() above.
333 */ 379 */
334 ret = exofs_get_io_state(&sbi->layout, &ios); 380
381 exofs_init_comps(&comps, &one_comp, sbi, EXOFS_SUPER_ID);
382
383 ret = ore_get_io_state(&sbi->layout, &comps, &ios);
335 if (unlikely(ret)) 384 if (unlikely(ret))
336 goto out; 385 goto out;
337 386
@@ -345,14 +394,12 @@ int exofs_sync_fs(struct super_block *sb, int wait)
345 fscb->s_newfs = 0; 394 fscb->s_newfs = 0;
346 fscb->s_version = EXOFS_FSCB_VER; 395 fscb->s_version = EXOFS_FSCB_VER;
347 396
348 ios->obj.id = EXOFS_SUPER_ID;
349 ios->offset = 0; 397 ios->offset = 0;
350 ios->kern_buff = fscb; 398 ios->kern_buff = fscb;
351 ios->cred = sbi->s_cred;
352 399
353 ret = exofs_sbi_write(ios); 400 ret = ore_write(ios);
354 if (unlikely(ret)) 401 if (unlikely(ret))
355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 402 EXOFS_ERR("%s: ore_write failed.\n", __func__);
356 else 403 else
357 sb->s_dirt = 0; 404 sb->s_dirt = 0;
358 405
@@ -360,7 +407,7 @@ int exofs_sync_fs(struct super_block *sb, int wait)
360 unlock_super(sb); 407 unlock_super(sb);
361out: 408out:
362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 409 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
363 exofs_put_io_state(ios); 410 ore_put_io_state(ios);
364 kfree(fscb); 411 kfree(fscb);
365 return ret; 412 return ret;
366} 413}
@@ -384,15 +431,17 @@ static void _exofs_print_device(const char *msg, const char *dev_path,
384 431
385void exofs_free_sbi(struct exofs_sb_info *sbi) 432void exofs_free_sbi(struct exofs_sb_info *sbi)
386{ 433{
387 while (sbi->layout.s_numdevs) { 434 while (sbi->comps.numdevs) {
388 int i = --sbi->layout.s_numdevs; 435 int i = --sbi->comps.numdevs;
389 struct osd_dev *od = sbi->layout.s_ods[i]; 436 struct osd_dev *od = sbi->comps.ods[i];
390 437
391 if (od) { 438 if (od) {
392 sbi->layout.s_ods[i] = NULL; 439 sbi->comps.ods[i] = NULL;
393 osduld_put_device(od); 440 osduld_put_device(od);
394 } 441 }
395 } 442 }
443 if (sbi->comps.ods != sbi->_min_one_dev)
444 kfree(sbi->comps.ods);
396 kfree(sbi); 445 kfree(sbi);
397} 446}
398 447
@@ -419,8 +468,8 @@ static void exofs_put_super(struct super_block *sb)
419 msecs_to_jiffies(100)); 468 msecs_to_jiffies(100));
420 } 469 }
421 470
422 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 471 _exofs_print_device("Unmounting", NULL, sbi->comps.ods[0],
423 sbi->layout.s_pid); 472 sbi->one_comp.obj.partition);
424 473
425 bdi_destroy(&sbi->bdi); 474 bdi_destroy(&sbi->bdi);
426 exofs_free_sbi(sbi); 475 exofs_free_sbi(sbi);
@@ -501,10 +550,19 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
501 return -EINVAL; 550 return -EINVAL;
502 } 551 }
503 552
553 EXOFS_DBGMSG("exofs: layout: "
554 "num_comps=%u stripe_unit=0x%x group_width=%u "
555 "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n",
556 numdevs,
557 sbi->layout.stripe_unit,
558 sbi->layout.group_width,
559 _LLU(sbi->layout.group_depth),
560 sbi->layout.mirrors_p1,
561 sbi->data_map.odm_raid_algorithm);
504 return 0; 562 return 0;
505} 563}
506 564
507static unsigned __ra_pages(struct exofs_layout *layout) 565static unsigned __ra_pages(struct ore_layout *layout)
508{ 566{
509 const unsigned _MIN_RA = 32; /* min 128K read-ahead */ 567 const unsigned _MIN_RA = 32; /* min 128K read-ahead */
510 unsigned ra_pages = layout->group_width * layout->stripe_unit / 568 unsigned ra_pages = layout->group_width * layout->stripe_unit /
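For the __ra_pages() hunk above (the divisor truncated by the diff context is presumably the page size, consistent with _MIN_RA = 32 pages amounting to 128 KiB on 4 KiB pages), a quick worked example: with group_width = 3, stripe_unit = 64 KiB and 4 KiB pages, ra_pages = 3 * 65536 / 4096 = 48 pages, i.e. 192 KiB of read-ahead, comfortably above the 32-page minimum.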
@@ -547,13 +605,11 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
547 return !(odi->systemid_len || odi->osdname_len); 605 return !(odi->systemid_len || odi->osdname_len);
548} 606}
549 607
550static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, 608static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi,
609 struct osd_dev *fscb_od,
551 unsigned table_count) 610 unsigned table_count)
552{ 611{
553 struct exofs_sb_info *sbi = *psbi; 612 struct ore_comp comp;
554 struct osd_dev *fscb_od;
555 struct osd_obj_id obj = {.partition = sbi->layout.s_pid,
556 .id = EXOFS_DEVTABLE_ID};
557 struct exofs_device_table *dt; 613 struct exofs_device_table *dt;
558 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + 614 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
559 sizeof(*dt); 615 sizeof(*dt);
@@ -567,10 +623,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
567 return -ENOMEM; 623 return -ENOMEM;
568 } 624 }
569 625
570 fscb_od = sbi->layout.s_ods[0]; 626 sbi->comps.numdevs = 0;
571 sbi->layout.s_ods[0] = NULL; 627
572 sbi->layout.s_numdevs = 0; 628 comp.obj.partition = sbi->one_comp.obj.partition;
573 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); 629 comp.obj.id = EXOFS_DEVTABLE_ID;
630 exofs_make_credential(comp.cred, &comp.obj);
631
632 ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt,
633 table_bytes);
574 if (unlikely(ret)) { 634 if (unlikely(ret)) {
575 EXOFS_ERR("ERROR: reading device table\n"); 635 EXOFS_ERR("ERROR: reading device table\n");
576 goto out; 636 goto out;
@@ -588,16 +648,18 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
588 goto out; 648 goto out;
589 649
590 if (likely(numdevs > 1)) { 650 if (likely(numdevs > 1)) {
591 unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); 651 unsigned size = numdevs * sizeof(sbi->comps.ods[0]);
592 652
593 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); 653 /* Twice bigger table: See exofs_init_comps() and below
594 if (unlikely(!sbi)) { 654 * comment
655 */
656 sbi->comps.ods = kzalloc(size + size - 1, GFP_KERNEL);
657 if (unlikely(!sbi->comps.ods)) {
658 EXOFS_ERR("ERROR: faild allocating Device array[%d]\n",
659 numdevs);
595 ret = -ENOMEM; 660 ret = -ENOMEM;
596 goto out; 661 goto out;
597 } 662 }
598 memset(&sbi->layout.s_ods[1], 0,
599 size - sizeof(sbi->layout.s_ods[0]));
600 *psbi = sbi;
601 } 663 }
602 664
603 for (i = 0; i < numdevs; i++) { 665 for (i = 0; i < numdevs; i++) {
@@ -619,8 +681,8 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
619 * line. We always keep them in device-table order. 681 * line. We always keep them in device-table order.
620 */ 682 */
621 if (fscb_od && osduld_device_same(fscb_od, &odi)) { 683 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
622 sbi->layout.s_ods[i] = fscb_od; 684 sbi->comps.ods[i] = fscb_od;
623 ++sbi->layout.s_numdevs; 685 ++sbi->comps.numdevs;
624 fscb_od = NULL; 686 fscb_od = NULL;
625 continue; 687 continue;
626 } 688 }
@@ -633,13 +695,13 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
633 goto out; 695 goto out;
634 } 696 }
635 697
636 sbi->layout.s_ods[i] = od; 698 sbi->comps.ods[i] = od;
637 ++sbi->layout.s_numdevs; 699 ++sbi->comps.numdevs;
638 700
639 /* Read the fscb of the other devices to make sure the FS 701 /* Read the fscb of the other devices to make sure the FS
640 * partition is there. 702 * partition is there.
641 */ 703 */
642 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, 704 ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb,
643 sizeof(fscb)); 705 sizeof(fscb));
644 if (unlikely(ret)) { 706 if (unlikely(ret)) {
645 EXOFS_ERR("ERROR: Malformed participating device " 707 EXOFS_ERR("ERROR: Malformed participating device "
@@ -656,13 +718,22 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
656 718
657out: 719out:
658 kfree(dt); 720 kfree(dt);
659 if (unlikely(!ret && fscb_od)) { 721 if (likely(!ret)) {
660 EXOFS_ERR( 722 unsigned numdevs = sbi->comps.numdevs;
661 "ERROR: Bad device-table container device not present\n");
662 osduld_put_device(fscb_od);
663 ret = -EINVAL;
664 }
665 723
724 if (unlikely(fscb_od)) {
725 EXOFS_ERR("ERROR: Bad device-table container device not present\n");
726 osduld_put_device(fscb_od);
727 return -EINVAL;
728 }
729 /* exofs round-robins the device table view according to inode
730 * number. We hold a twice-bigger table, hence inodes can point
731 * to any device and have a sequential view of the table
732 * starting at this device. See exofs_init_comps()
733 */
734 for (i = 0; i < numdevs - 1; ++i)
735 sbi->comps.ods[i + numdevs] = sbi->comps.ods[i];
736 }
666 return ret; 737 return ret;
667} 738}
668 739
@@ -676,7 +747,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
676 struct exofs_sb_info *sbi; /*extended info */ 747 struct exofs_sb_info *sbi; /*extended info */
677 struct osd_dev *od; /* Master device */ 748 struct osd_dev *od; /* Master device */
678 struct exofs_fscb fscb; /*on-disk superblock info */ 749 struct exofs_fscb fscb; /*on-disk superblock info */
679 struct osd_obj_id obj; 750 struct ore_comp comp;
680 unsigned table_count; 751 unsigned table_count;
681 int ret; 752 int ret;
682 753
@@ -684,10 +755,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
684 if (!sbi) 755 if (!sbi)
685 return -ENOMEM; 756 return -ENOMEM;
686 757
687 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
688 if (ret)
689 goto free_bdi;
690
691 /* use mount options to fill superblock */ 758 /* use mount options to fill superblock */
692 if (opts->is_osdname) { 759 if (opts->is_osdname) {
693 struct osd_dev_info odi = {.systemid_len = 0}; 760 struct osd_dev_info odi = {.systemid_len = 0};
@@ -695,6 +762,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
695 odi.osdname_len = strlen(opts->dev_name); 762 odi.osdname_len = strlen(opts->dev_name);
696 odi.osdname = (u8 *)opts->dev_name; 763 odi.osdname = (u8 *)opts->dev_name;
697 od = osduld_info_lookup(&odi); 764 od = osduld_info_lookup(&odi);
765 kfree(opts->dev_name);
766 opts->dev_name = NULL;
698 } else { 767 } else {
699 od = osduld_path_lookup(opts->dev_name); 768 od = osduld_path_lookup(opts->dev_name);
700 } 769 }
@@ -709,11 +778,16 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
709 sbi->layout.group_width = 1; 778 sbi->layout.group_width = 1;
710 sbi->layout.group_depth = -1; 779 sbi->layout.group_depth = -1;
711 sbi->layout.group_count = 1; 780 sbi->layout.group_count = 1;
712 sbi->layout.s_ods[0] = od;
713 sbi->layout.s_numdevs = 1;
714 sbi->layout.s_pid = opts->pid;
715 sbi->s_timeout = opts->timeout; 781 sbi->s_timeout = opts->timeout;
716 782
783 sbi->one_comp.obj.partition = opts->pid;
784 sbi->one_comp.obj.id = 0;
785 exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj);
786 sbi->comps.numdevs = 1;
787 sbi->comps.single_comp = EC_SINGLE_COMP;
788 sbi->comps.comps = &sbi->one_comp;
789 sbi->comps.ods = sbi->_min_one_dev;
790
717 /* fill in some other data by hand */ 791 /* fill in some other data by hand */
718 memset(sb->s_id, 0, sizeof(sb->s_id)); 792 memset(sb->s_id, 0, sizeof(sb->s_id));
719 strcpy(sb->s_id, "exofs"); 793 strcpy(sb->s_id, "exofs");
@@ -724,11 +798,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
724 sb->s_bdev = NULL; 798 sb->s_bdev = NULL;
725 sb->s_dev = 0; 799 sb->s_dev = 0;
726 800
727 obj.partition = sbi->layout.s_pid; 801 comp.obj.partition = sbi->one_comp.obj.partition;
728 obj.id = EXOFS_SUPER_ID; 802 comp.obj.id = EXOFS_SUPER_ID;
729 exofs_make_credential(sbi->s_cred, &obj); 803 exofs_make_credential(comp.cred, &comp.obj);
730 804
731 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); 805 ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb));
732 if (unlikely(ret)) 806 if (unlikely(ret))
733 goto free_sbi; 807 goto free_sbi;
734 808
@@ -757,9 +831,11 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
757 831
758 table_count = le64_to_cpu(fscb.s_dev_table_count); 832 table_count = le64_to_cpu(fscb.s_dev_table_count);
759 if (table_count) { 833 if (table_count) {
760 ret = exofs_read_lookup_dev_table(&sbi, table_count); 834 ret = exofs_read_lookup_dev_table(sbi, od, table_count);
761 if (unlikely(ret)) 835 if (unlikely(ret))
762 goto free_sbi; 836 goto free_sbi;
837 } else {
838 sbi->comps.ods[0] = od;
763 } 839 }
764 840
765 __sbi_read_stats(sbi); 841 __sbi_read_stats(sbi);
@@ -793,20 +869,20 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
793 goto free_sbi; 869 goto free_sbi;
794 } 870 }
795 871
796 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], 872 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
797 sbi->layout.s_pid); 873 if (ret) {
798 if (opts->is_osdname) 874 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
799 kfree(opts->dev_name); 875 goto free_sbi;
876 }
877
878 _exofs_print_device("Mounting", opts->dev_name, sbi->comps.ods[0],
879 sbi->one_comp.obj.partition);
800 return 0; 880 return 0;
801 881
802free_sbi: 882free_sbi:
803 bdi_destroy(&sbi->bdi);
804free_bdi:
805 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 883 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
806 opts->dev_name, sbi->layout.s_pid, ret); 884 opts->dev_name, sbi->one_comp.obj.partition, ret);
807 exofs_free_sbi(sbi); 885 exofs_free_sbi(sbi);
808 if (opts->is_osdname)
809 kfree(opts->dev_name);
810 return ret; 886 return ret;
811} 887}
812 888
@@ -837,7 +913,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
837{ 913{
838 struct super_block *sb = dentry->d_sb; 914 struct super_block *sb = dentry->d_sb;
839 struct exofs_sb_info *sbi = sb->s_fs_info; 915 struct exofs_sb_info *sbi = sb->s_fs_info;
840 struct exofs_io_state *ios; 916 struct ore_io_state *ios;
841 struct osd_attr attrs[] = { 917 struct osd_attr attrs[] = {
842 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, 918 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
843 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), 919 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -846,21 +922,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
846 }; 922 };
847 uint64_t capacity = ULLONG_MAX; 923 uint64_t capacity = ULLONG_MAX;
848 uint64_t used = ULLONG_MAX; 924 uint64_t used = ULLONG_MAX;
849 uint8_t cred_a[OSD_CAP_LEN];
850 int ret; 925 int ret;
851 926
852 ret = exofs_get_io_state(&sbi->layout, &ios); 927 ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
853 if (ret) { 928 if (ret) {
854 EXOFS_DBGMSG("exofs_get_io_state failed.\n"); 929 EXOFS_DBGMSG("ore_get_io_state failed.\n");
855 return ret; 930 return ret;
856 } 931 }
857 932
858 exofs_make_credential(cred_a, &ios->obj);
859 ios->cred = sbi->s_cred;
860 ios->in_attr = attrs; 933 ios->in_attr = attrs;
861 ios->in_attr_len = ARRAY_SIZE(attrs); 934 ios->in_attr_len = ARRAY_SIZE(attrs);
862 935
863 ret = exofs_sbi_read(ios); 936 ret = ore_read(ios);
864 if (unlikely(ret)) 937 if (unlikely(ret))
865 goto out; 938 goto out;
866 939
@@ -889,7 +962,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
889 buf->f_namelen = EXOFS_NAME_LEN; 962 buf->f_namelen = EXOFS_NAME_LEN;
890 963
891out: 964out:
892 exofs_put_io_state(ios); 965 ore_put_io_state(ios);
893 return ret; 966 return ret;
894} 967}
895 968
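Across super.c the conversions settle on one calling pattern for the new ORE API; a condensed sketch of it, modeled on the __sbi_read_stats() hunk above (error paths trimmed, `attrs` as in that hunk):

        struct ore_io_state *ios;
        int ret;

        /* Credentials and object ids now travel in sbi->comps
         * (ore_components) instead of being copied into every io_state
         * by hand. */
        ret = ore_get_io_state(&sbi->layout, &sbi->comps, &ios);
        if (unlikely(ret))
                return ret;

        ios->in_attr = attrs;                 /* attributes to fetch */
        ios->in_attr_len = ARRAY_SIZE(attrs);

        ret = ore_read(ios);                  /* was exofs_sbi_read() */
        /* ... decode attributes from ios ... */
        ore_put_io_state(ios);
        return ret;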
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 52c053763942..35d6a3cfd9ff 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -194,12 +194,10 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
194 case ACL_TYPE_ACCESS: 194 case ACL_TYPE_ACCESS:
195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; 195 name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
196 if (acl) { 196 if (acl) {
197 mode_t mode = inode->i_mode; 197 error = posix_acl_equiv_mode(acl, &inode->i_mode);
198 error = posix_acl_equiv_mode(acl, &mode);
199 if (error < 0) 198 if (error < 0)
200 return error; 199 return error;
201 else { 200 else {
202 inode->i_mode = mode;
203 inode->i_ctime = CURRENT_TIME_SEC; 201 inode->i_ctime = CURRENT_TIME_SEC;
204 mark_inode_dirty(inode); 202 mark_inode_dirty(inode);
205 if (error == 0) 203 if (error == 0)
@@ -253,16 +251,14 @@ ext2_init_acl(struct inode *inode, struct inode *dir)
253 inode->i_mode &= ~current_umask(); 251 inode->i_mode &= ~current_umask();
254 } 252 }
255 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 253 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
256 mode_t mode = inode->i_mode;
257 if (S_ISDIR(inode->i_mode)) { 254 if (S_ISDIR(inode->i_mode)) {
258 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl); 255 error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
259 if (error) 256 if (error)
260 goto cleanup; 257 goto cleanup;
261 } 258 }
262 error = posix_acl_create(&acl, GFP_KERNEL, &mode); 259 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
263 if (error < 0) 260 if (error < 0)
264 return error; 261 return error;
265 inode->i_mode = mode;
266 if (error > 0) { 262 if (error > 0) {
267 /* This is an extended ACL */ 263 /* This is an extended ACL */
268 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl); 264 error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 6c29bf0df04a..3091f62e55b6 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -199,12 +199,10 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
199 case ACL_TYPE_ACCESS: 199 case ACL_TYPE_ACCESS:
200 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS; 200 name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
201 if (acl) { 201 if (acl) {
202 mode_t mode = inode->i_mode; 202 error = posix_acl_equiv_mode(acl, &inode->i_mode);
203 error = posix_acl_equiv_mode(acl, &mode);
204 if (error < 0) 203 if (error < 0)
205 return error; 204 return error;
206 else { 205 else {
207 inode->i_mode = mode;
208 inode->i_ctime = CURRENT_TIME_SEC; 206 inode->i_ctime = CURRENT_TIME_SEC;
209 ext3_mark_inode_dirty(handle, inode); 207 ext3_mark_inode_dirty(handle, inode);
210 if (error == 0) 208 if (error == 0)
@@ -261,19 +259,16 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
261 inode->i_mode &= ~current_umask(); 259 inode->i_mode &= ~current_umask();
262 } 260 }
263 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 261 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
264 mode_t mode = inode->i_mode;
265
266 if (S_ISDIR(inode->i_mode)) { 262 if (S_ISDIR(inode->i_mode)) {
267 error = ext3_set_acl(handle, inode, 263 error = ext3_set_acl(handle, inode,
268 ACL_TYPE_DEFAULT, acl); 264 ACL_TYPE_DEFAULT, acl);
269 if (error) 265 if (error)
270 goto cleanup; 266 goto cleanup;
271 } 267 }
272 error = posix_acl_create(&acl, GFP_NOFS, &mode); 268 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
273 if (error < 0) 269 if (error < 0)
274 return error; 270 return error;
275 271
276 inode->i_mode = mode;
277 if (error > 0) { 272 if (error > 0) {
278 /* This is an extended ACL */ 273 /* This is an extended ACL */
279 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); 274 error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 04109460ba9e..56fd8f865930 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ 7ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
10 mmp.o 10 mmp.o indirect.o
11 11
12ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 12ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index dca2d1ded931..a5c29bb3b835 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -198,12 +198,10 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
198 case ACL_TYPE_ACCESS: 198 case ACL_TYPE_ACCESS:
199 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; 199 name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
200 if (acl) { 200 if (acl) {
201 mode_t mode = inode->i_mode; 201 error = posix_acl_equiv_mode(acl, &inode->i_mode);
202 error = posix_acl_equiv_mode(acl, &mode);
203 if (error < 0) 202 if (error < 0)
204 return error; 203 return error;
205 else { 204 else {
206 inode->i_mode = mode;
207 inode->i_ctime = ext4_current_time(inode); 205 inode->i_ctime = ext4_current_time(inode);
208 ext4_mark_inode_dirty(handle, inode); 206 ext4_mark_inode_dirty(handle, inode);
209 if (error == 0) 207 if (error == 0)
@@ -259,19 +257,16 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
259 inode->i_mode &= ~current_umask(); 257 inode->i_mode &= ~current_umask();
260 } 258 }
261 if (test_opt(inode->i_sb, POSIX_ACL) && acl) { 259 if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
262 mode_t mode = inode->i_mode;
263
264 if (S_ISDIR(inode->i_mode)) { 260 if (S_ISDIR(inode->i_mode)) {
265 error = ext4_set_acl(handle, inode, 261 error = ext4_set_acl(handle, inode,
266 ACL_TYPE_DEFAULT, acl); 262 ACL_TYPE_DEFAULT, acl);
267 if (error) 263 if (error)
268 goto cleanup; 264 goto cleanup;
269 } 265 }
270 error = posix_acl_create(&acl, GFP_NOFS, &mode); 266 error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
271 if (error < 0) 267 if (error < 0)
272 return error; 268 return error;
273 269
274 inode->i_mode = mode;
275 if (error > 0) { 270 if (error > 0) {
276 /* This is an extended ACL */ 271 /* This is an extended ACL */
277 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl); 272 error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
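ext2, ext3 and ext4 all pick up the same shape here: posix_acl_equiv_mode() now writes the equivalent permission bits straight into the umode_t i_mode, so the mode_t staging variable disappears. A sketch of the resulting flow, following the ext2 hunk; the return-value meanings are recalled from the posix_acl_equiv_mode() contract rather than spelled out in these hunks:

        error = posix_acl_equiv_mode(acl, &inode->i_mode);
        if (error < 0)
                return error;        /* hard error, propagate */
        inode->i_ctime = CURRENT_TIME_SEC;
        mark_inode_dirty(inode);
        if (error == 0)
                acl = NULL;          /* ACL fully expressed by i_mode;
                                      * no access xattr needs storing */
        /* error > 0: an extended ACL, stored as an xattr as before */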
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 264f6949511e..f8224adf496e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -620,3 +620,51 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
620 620
621} 621}
622 622
623/**
624 * ext4_inode_to_goal_block - return a hint for block allocation
625 * @inode: inode for block allocation
626 *
627 * Return the ideal location to start allocating blocks for a
628 * newly created inode.
629 */
630ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode)
631{
632 struct ext4_inode_info *ei = EXT4_I(inode);
633 ext4_group_t block_group;
634 ext4_grpblk_t colour;
635 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
636 ext4_fsblk_t bg_start;
637 ext4_fsblk_t last_block;
638
639 block_group = ei->i_block_group;
640 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
641 /*
642 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
643 * block groups per flexgroup, reserve the first block
644 * group for directories and special files. Regular
645 * files will start at the second block group. This
646 * tends to speed up directory access and improves
647 * fsck times.
648 */
649 block_group &= ~(flex_size-1);
650 if (S_ISREG(inode->i_mode))
651 block_group++;
652 }
653 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
654 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
655
656 /*
657 * If we are doing delayed allocation, we don't need to take
658 * colour into account.
659 */
660 if (test_opt(inode->i_sb, DELALLOC))
661 return bg_start;
662
663 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
664 colour = (current->pid % 16) *
665 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
666 else
667 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
668 return bg_start + colour;
669}
670
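The colour term in ext4_inode_to_goal_block() fans different processes out across a block group so their allocations don't all collide at bg_start. A worked example with illustrative numbers: with EXT4_BLOCKS_PER_GROUP = 32768 and current->pid = 4242, colour = (4242 % 16) * (32768 / 16) = 2 * 2048 = 4096, so the goal becomes bg_start + 4096; with delayed allocation enabled the function returns bg_start unmodified.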
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index fac90f3fba80..8efb2f0a3447 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -246,3 +246,24 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
246 return 1; 246 return 1;
247} 247}
248 248
249int ext4_check_blockref(const char *function, unsigned int line,
250 struct inode *inode, __le32 *p, unsigned int max)
251{
252 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
253 __le32 *bref = p;
254 unsigned int blk;
255
256 while (bref < p+max) {
257 blk = le32_to_cpu(*bref++);
258 if (blk &&
259 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
260 blk, 1))) {
261 es->s_last_error_block = cpu_to_le64(blk);
262 ext4_error_inode(inode, function, line, blk,
263 "invalid block");
264 return -EIO;
265 }
266 }
267 return 0;
268}
269
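ext4_check_blockref() returns 0 when every non-zero pointer passes ext4_data_block_valid(), and -EIO (after recording the bad block and flagging the inode) otherwise; callers reach it through the two wrapper macros this patch adds to ext4.h further down. A hypothetical call site, assuming a freshly read indirect block (`pblock` and the surrounding code are illustrative):

        /* Hypothetical caller: validate an indirect block just read from
         * disk before chasing any of its block pointers. */
        struct buffer_head *bh = sb_bread(inode->i_sb, pblock);

        if (!bh)
                return -EIO;
        if (ext4_check_indirect_blockref(inode, bh)) { /* wrapper from ext4.h */
                brelse(bh);
                return -EIO;  /* a pointer fell outside the valid range */
        }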
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fa44df879711..e717dfd2f2b4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -526,6 +526,7 @@ struct ext4_new_group_data {
526#define EXT4_FREE_BLOCKS_METADATA 0x0001 526#define EXT4_FREE_BLOCKS_METADATA 0x0001
527#define EXT4_FREE_BLOCKS_FORGET 0x0002 527#define EXT4_FREE_BLOCKS_FORGET 0x0002
528#define EXT4_FREE_BLOCKS_VALIDATED 0x0004 528#define EXT4_FREE_BLOCKS_VALIDATED 0x0004
529#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
529 530
530/* 531/*
531 * ioctl commands 532 * ioctl commands
@@ -939,6 +940,8 @@ struct ext4_inode_info {
939#define ext4_find_next_zero_bit find_next_zero_bit_le 940#define ext4_find_next_zero_bit find_next_zero_bit_le
940#define ext4_find_next_bit find_next_bit_le 941#define ext4_find_next_bit find_next_bit_le
941 942
943extern void ext4_set_bits(void *bm, int cur, int len);
944
942/* 945/*
943 * Maximal mount counts between two filesystem checks 946 * Maximal mount counts between two filesystem checks
944 */ 947 */
@@ -1126,7 +1129,8 @@ struct ext4_sb_info {
1126 struct journal_s *s_journal; 1129 struct journal_s *s_journal;
1127 struct list_head s_orphan; 1130 struct list_head s_orphan;
1128 struct mutex s_orphan_lock; 1131 struct mutex s_orphan_lock;
1129 struct mutex s_resize_lock; 1132 unsigned long s_resize_flags; /* Flags indicating if there
1133 is a resizer */
1130 unsigned long s_commit_interval; 1134 unsigned long s_commit_interval;
1131 u32 s_max_batch_time; 1135 u32 s_max_batch_time;
1132 u32 s_min_batch_time; 1136 u32 s_min_batch_time;
@@ -1214,6 +1218,9 @@ struct ext4_sb_info {
1214 1218
1215 /* Kernel thread for multiple mount protection */ 1219 /* Kernel thread for multiple mount protection */
1216 struct task_struct *s_mmp_tsk; 1220 struct task_struct *s_mmp_tsk;
1221
1222 /* record the last minlen when FITRIM is called. */
1223 atomic_t s_last_trim_minblks;
1217}; 1224};
1218 1225
1219static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1226static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1743,6 +1750,7 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1743 struct ext4_group_desc *desc); 1750 struct ext4_group_desc *desc);
1744#define ext4_free_blocks_after_init(sb, group, desc) \ 1751#define ext4_free_blocks_after_init(sb, group, desc) \
1745 ext4_init_block_bitmap(sb, NULL, group, desc) 1752 ext4_init_block_bitmap(sb, NULL, group, desc)
1753ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
1746 1754
1747/* dir.c */ 1755/* dir.c */
1748extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 1756extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
@@ -1793,7 +1801,7 @@ extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1793 unsigned long count, int flags); 1801 unsigned long count, int flags);
1794extern int ext4_mb_add_groupinfo(struct super_block *sb, 1802extern int ext4_mb_add_groupinfo(struct super_block *sb,
1795 ext4_group_t i, struct ext4_group_desc *desc); 1803 ext4_group_t i, struct ext4_group_desc *desc);
1796extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1804extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
1797 ext4_fsblk_t block, unsigned long count); 1805 ext4_fsblk_t block, unsigned long count);
1798extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); 1806extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
1799 1807
@@ -1834,6 +1842,17 @@ extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1834extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1842extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1835extern void ext4_da_update_reserve_space(struct inode *inode, 1843extern void ext4_da_update_reserve_space(struct inode *inode,
1836 int used, int quota_claim); 1844 int used, int quota_claim);
1845
1846/* indirect.c */
1847extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
1848 struct ext4_map_blocks *map, int flags);
1849extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
1850 const struct iovec *iov, loff_t offset,
1851 unsigned long nr_segs);
1852extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
1853extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
1854extern void ext4_ind_truncate(struct inode *inode);
1855
1837/* ioctl.c */ 1856/* ioctl.c */
1838extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 1857extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1839extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); 1858extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
@@ -1855,6 +1874,9 @@ extern int ext4_group_extend(struct super_block *sb,
1855 ext4_fsblk_t n_blocks_count); 1874 ext4_fsblk_t n_blocks_count);
1856 1875
1857/* super.c */ 1876/* super.c */
1877extern void *ext4_kvmalloc(size_t size, gfp_t flags);
1878extern void *ext4_kvzalloc(size_t size, gfp_t flags);
1879extern void ext4_kvfree(void *ptr);
1858extern void __ext4_error(struct super_block *, const char *, unsigned int, 1880extern void __ext4_error(struct super_block *, const char *, unsigned int,
1859 const char *, ...) 1881 const char *, ...)
1860 __attribute__ ((format (printf, 4, 5))); 1882 __attribute__ ((format (printf, 4, 5)));
@@ -2067,11 +2089,19 @@ struct ext4_group_info {
2067 * 5 free 8-block regions. */ 2089 * 5 free 8-block regions. */
2068}; 2090};
2069 2091
2070#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 2092#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
2093#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1
2071 2094
2072#define EXT4_MB_GRP_NEED_INIT(grp) \ 2095#define EXT4_MB_GRP_NEED_INIT(grp) \
2073 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 2096 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
2074 2097
2098#define EXT4_MB_GRP_WAS_TRIMMED(grp) \
2099 (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2100#define EXT4_MB_GRP_SET_TRIMMED(grp) \
2101 (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2102#define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
2103 (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
2104
2075#define EXT4_MAX_CONTENTION 8 2105#define EXT4_MAX_CONTENTION 8
2076#define EXT4_CONTENTION_THRESHOLD 2 2106#define EXT4_CONTENTION_THRESHOLD 2
2077 2107
@@ -2123,6 +2153,19 @@ static inline void ext4_mark_super_dirty(struct super_block *sb)
2123} 2153}
2124 2154
2125/* 2155/*
2156 * Block validity checking
2157 */
2158#define ext4_check_indirect_blockref(inode, bh) \
2159 ext4_check_blockref(__func__, __LINE__, inode, \
2160 (__le32 *)(bh)->b_data, \
2161 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
2162
2163#define ext4_ind_check_inode(inode) \
2164 ext4_check_blockref(__func__, __LINE__, inode, \
2165 EXT4_I(inode)->i_data, \
2166 EXT4_NDIR_BLOCKS)
2167
2168/*
2126 * Inodes and files operations 2169 * Inodes and files operations
2127 */ 2170 */
2128 2171
@@ -2151,6 +2194,8 @@ extern void ext4_exit_system_zone(void);
2151extern int ext4_data_block_valid(struct ext4_sb_info *sbi, 2194extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
2152 ext4_fsblk_t start_blk, 2195 ext4_fsblk_t start_blk,
2153 unsigned int count); 2196 unsigned int count);
2197extern int ext4_check_blockref(const char *, unsigned int,
2198 struct inode *, __le32 *, unsigned int);
2154 2199
2155/* extents.c */ 2200/* extents.c */
2156extern int ext4_ext_tree_init(handle_t *handle, struct inode *); 2201extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
@@ -2230,6 +2275,10 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2230extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 2275extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2231extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ]; 2276extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2232 2277
2278#define EXT4_RESIZING 0
2279extern int ext4_resize_begin(struct super_block *sb);
2280extern void ext4_resize_end(struct super_block *sb);
2281
2233#endif /* __KERNEL__ */ 2282#endif /* __KERNEL__ */
2234 2283
2235#endif /* _EXT4_H */ 2284#endif /* _EXT4_H */
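Replacing s_resize_lock with s_resize_flags turns resize serialization into an atomic bit: EXT4_RESIZING is bit 0 of the new field. A plausible sketch of the two helpers declared above; their real bodies live in fs/ext4/resize.c (not part of these hunks), so treat this as an assumption about their shape, not the exact code:

int ext4_resize_begin(struct super_block *sb)
{
        /* Only one resizer at a time; -EBUSY if the bit is already set. */
        if (test_and_set_bit_lock(EXT4_RESIZING,
                                  &EXT4_SB(sb)->s_resize_flags))
                return -EBUSY;
        return 0;
}

void ext4_resize_end(struct super_block *sb)
{
        clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
}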
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f815cc81e7a2..57cf568a98ab 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -114,12 +114,6 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
114 struct ext4_ext_path *path, 114 struct ext4_ext_path *path,
115 ext4_lblk_t block) 115 ext4_lblk_t block)
116{ 116{
117 struct ext4_inode_info *ei = EXT4_I(inode);
118 ext4_fsblk_t bg_start;
119 ext4_fsblk_t last_block;
120 ext4_grpblk_t colour;
121 ext4_group_t block_group;
122 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
123 int depth; 117 int depth;
124 118
125 if (path) { 119 if (path) {
@@ -161,36 +155,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
161 } 155 }
162 156
163 /* OK. use inode's group */ 157 /* OK. use inode's group */
164 block_group = ei->i_block_group; 158 return ext4_inode_to_goal_block(inode);
165 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
166 /*
167 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
168 * block groups per flexgroup, reserve the first block
169 * group for directories and special files. Regular
170 * files will start at the second block group. This
171 * tends to speed up directory access and improves
172 * fsck times.
173 */
174 block_group &= ~(flex_size-1);
175 if (S_ISREG(inode->i_mode))
176 block_group++;
177 }
178 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
179 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
180
181 /*
182 * If we are doing delayed allocation, we don't need take
183 * colour into account.
184 */
185 if (test_opt(inode->i_sb, DELALLOC))
186 return bg_start;
187
188 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
189 colour = (current->pid % 16) *
190 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
191 else
192 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
193 return bg_start + colour + block;
194} 159}
195 160
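The deleted lines computed a per-task "colour" so that concurrent allocators in the same block group start at different offsets; that heuristic now sits behind ext4_inode_to_goal_block(). Just the colouring arithmetic, as a standalone sketch with assumed example geometry (real values come from the superblock):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long blocks_per_group = 32768;		/* assumed: 4KB blocks */
	unsigned long bg_start = 1024UL * 32768;	/* first block of the group */
	unsigned long colour;

	/* same formula as the removed code: 16 PID-keyed lanes per group */
	colour = (getpid() % 16) * (blocks_per_group / 16);
	printf("goal = %lu (bg_start %lu + colour %lu)\n",
	       bg_start + colour, bg_start, colour);
	return 0;
}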
196/* 161/*
@@ -776,6 +741,16 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
776 logical, le32_to_cpu(curp->p_idx->ei_block)); 741 logical, le32_to_cpu(curp->p_idx->ei_block));
777 return -EIO; 742 return -EIO;
778 } 743 }
744
745 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
746 >= le16_to_cpu(curp->p_hdr->eh_max))) {
747 EXT4_ERROR_INODE(inode,
748 "eh_entries %d >= eh_max %d!",
749 le16_to_cpu(curp->p_hdr->eh_entries),
750 le16_to_cpu(curp->p_hdr->eh_max));
751 return -EIO;
752 }
753
779 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; 754 len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx;
780 if (logical > le32_to_cpu(curp->p_idx->ei_block)) { 755 if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
781 /* insert after */ 756 /* insert after */
@@ -805,13 +780,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
805 ext4_idx_store_pblock(ix, ptr); 780 ext4_idx_store_pblock(ix, ptr);
806 le16_add_cpu(&curp->p_hdr->eh_entries, 1); 781 le16_add_cpu(&curp->p_hdr->eh_entries, 1);
807 782
808 if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
809 > le16_to_cpu(curp->p_hdr->eh_max))) {
810 EXT4_ERROR_INODE(inode,
811 "logical %d == ei_block %d!",
812 logical, le32_to_cpu(curp->p_idx->ei_block));
813 return -EIO;
814 }
815 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) { 783 if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
816 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!"); 784 EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
817 return -EIO; 785 return -EIO;
@@ -1446,8 +1414,7 @@ ext4_ext_next_allocated_block(struct ext4_ext_path *path)
1446 * ext4_ext_next_leaf_block: 1414 * ext4_ext_next_leaf_block:
1447 * returns first allocated block from next leaf or EXT_MAX_BLOCKS 1415 * returns first allocated block from next leaf or EXT_MAX_BLOCKS
1448 */ 1416 */
1449static ext4_lblk_t ext4_ext_next_leaf_block(struct inode *inode, 1417static ext4_lblk_t ext4_ext_next_leaf_block(struct ext4_ext_path *path)
1450 struct ext4_ext_path *path)
1451{ 1418{
1452 int depth; 1419 int depth;
1453 1420
@@ -1757,7 +1724,6 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1757 goto merge; 1724 goto merge;
1758 } 1725 }
1759 1726
1760repeat:
1761 depth = ext_depth(inode); 1727 depth = ext_depth(inode);
1762 eh = path[depth].p_hdr; 1728 eh = path[depth].p_hdr;
1763 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) 1729 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max))
@@ -1765,9 +1731,10 @@ repeat:
1765 1731
1766 /* probably next leaf has space for us? */ 1732 /* probably next leaf has space for us? */
1767 fex = EXT_LAST_EXTENT(eh); 1733 fex = EXT_LAST_EXTENT(eh);
1768 next = ext4_ext_next_leaf_block(inode, path); 1734 next = EXT_MAX_BLOCKS;
1769 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block) 1735 if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block))
1770 && next != EXT_MAX_BLOCKS) { 1736 next = ext4_ext_next_leaf_block(path);
1737 if (next != EXT_MAX_BLOCKS) {
1771 ext_debug("next leaf block - %d\n", next); 1738 ext_debug("next leaf block - %d\n", next);
1772 BUG_ON(npath != NULL); 1739 BUG_ON(npath != NULL);
1773 npath = ext4_ext_find_extent(inode, next, NULL); 1740 npath = ext4_ext_find_extent(inode, next, NULL);
@@ -1779,7 +1746,7 @@ repeat:
1779 ext_debug("next leaf isn't full(%d)\n", 1746 ext_debug("next leaf isn't full(%d)\n",
1780 le16_to_cpu(eh->eh_entries)); 1747 le16_to_cpu(eh->eh_entries));
1781 path = npath; 1748 path = npath;
1782 goto repeat; 1749 goto has_space;
1783 } 1750 }
1784 ext_debug("next leaf has no free space(%d,%d)\n", 1751 ext_debug("next leaf has no free space(%d,%d)\n",
1785 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); 1752 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -1839,7 +1806,7 @@ has_space:
1839 ext4_ext_pblock(newext), 1806 ext4_ext_pblock(newext),
1840 ext4_ext_is_uninitialized(newext), 1807 ext4_ext_is_uninitialized(newext),
1841 ext4_ext_get_actual_len(newext), 1808 ext4_ext_get_actual_len(newext),
1842 nearex, len, nearex + 1, nearex + 2); 1809 nearex, len, nearex, nearex + 1);
1843 memmove(nearex + 1, nearex, len); 1810 memmove(nearex + 1, nearex, len);
1844 path[depth].p_ext = nearex; 1811 path[depth].p_ext = nearex;
1845 } 1812 }
@@ -2052,7 +2019,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
2052} 2019}
2053 2020
2054/* 2021/*
2055 * ext4_ext_in_cache() 2022 * ext4_ext_check_cache()
2056 * Checks to see if the given block is in the cache. 2023 * Checks to see if the given block is in the cache.
2057 * If it is, the cached extent is stored in the given 2024 * If it is, the cached extent is stored in the given
2058 * cache extent pointer. If the cached extent is a hole, 2025 * cache extent pointer. If the cached extent is a hole,
@@ -2134,8 +2101,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2134/* 2101/*
2135 * ext4_ext_rm_idx: 2102 * ext4_ext_rm_idx:
2136 * removes index from the index block. 2103 * removes index from the index block.
2137 * It's used in truncate case only, thus all requests are for
2138 * last index in the block only.
2139 */ 2104 */
2140static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 2105static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2141 struct ext4_ext_path *path) 2106 struct ext4_ext_path *path)
@@ -2153,6 +2118,13 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2153 err = ext4_ext_get_access(handle, inode, path); 2118 err = ext4_ext_get_access(handle, inode, path);
2154 if (err) 2119 if (err)
2155 return err; 2120 return err;
2121
2122 if (path->p_idx != EXT_LAST_INDEX(path->p_hdr)) {
2123 int len = EXT_LAST_INDEX(path->p_hdr) - path->p_idx;
2124 len *= sizeof(struct ext4_extent_idx);
2125 memmove(path->p_idx, path->p_idx + 1, len);
2126 }
2127
2156 le16_add_cpu(&path->p_hdr->eh_entries, -1); 2128 le16_add_cpu(&path->p_hdr->eh_entries, -1);
2157 err = ext4_ext_dirty(handle, inode, path); 2129 err = ext4_ext_dirty(handle, inode, path);
2158 if (err) 2130 if (err)
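ext4_ext_rm_idx() used to assume it was removing the last index in the block; the new memmove() compacts the array so any slot can go. The same remove-from-the-middle pattern in a self-contained form (struct idx is a simplified stand-in for struct ext4_extent_idx):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct idx {
	uint32_t ei_block;	/* first logical block covered */
	uint64_t pblk;		/* physical block of the child node */
};

int main(void)
{
	struct idx idxs[4] = { {0, 10}, {100, 11}, {200, 12}, {300, 13} };
	unsigned int entries = 4, victim = 1;

	/* if the victim is not the last entry, slide the tail down by
	 * one slot before dropping the count (mirrors the new hunk) */
	if (victim != entries - 1)
		memmove(&idxs[victim], &idxs[victim + 1],
			(entries - 1 - victim) * sizeof(struct idx));
	entries--;

	for (unsigned int i = 0; i < entries; i++)
		printf("%u -> %llu\n", idxs[i].ei_block,
		       (unsigned long long)idxs[i].pblk);
	return 0;
}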
@@ -2534,8 +2506,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path)
2534 return 1; 2506 return 1;
2535} 2507}
2536 2508
2537static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, 2509static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2538 ext4_lblk_t end)
2539{ 2510{
2540 struct super_block *sb = inode->i_sb; 2511 struct super_block *sb = inode->i_sb;
2541 int depth = ext_depth(inode); 2512 int depth = ext_depth(inode);
@@ -2575,7 +2546,7 @@ again:
2575 if (i == depth) { 2546 if (i == depth) {
2576 /* this is leaf block */ 2547 /* this is leaf block */
2577 err = ext4_ext_rm_leaf(handle, inode, path, 2548 err = ext4_ext_rm_leaf(handle, inode, path,
2578 start, end); 2549 start, EXT_MAX_BLOCKS - 1);
2579 /* root level has p_bh == NULL, brelse() eats this */ 2550 /* root level has p_bh == NULL, brelse() eats this */
2580 brelse(path[i].p_bh); 2551 brelse(path[i].p_bh);
2581 path[i].p_bh = NULL; 2552 path[i].p_bh = NULL;
@@ -3107,12 +3078,10 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
3107 struct ext4_ext_path *path) 3078 struct ext4_ext_path *path)
3108{ 3079{
3109 struct ext4_extent *ex; 3080 struct ext4_extent *ex;
3110 struct ext4_extent_header *eh;
3111 int depth; 3081 int depth;
3112 int err = 0; 3082 int err = 0;
3113 3083
3114 depth = ext_depth(inode); 3084 depth = ext_depth(inode);
3115 eh = path[depth].p_hdr;
3116 ex = path[depth].p_ext; 3085 ex = path[depth].p_ext;
3117 3086
3118 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" 3087 ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical"
@@ -3357,8 +3326,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3357 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); 3326 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3358 3327
3359 /* check in cache */ 3328 /* check in cache */
3360 if (ext4_ext_in_cache(inode, map->m_lblk, &newex) && 3329 if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
3361 ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0)) { 3330 ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
3362 if (!newex.ee_start_lo && !newex.ee_start_hi) { 3331 if (!newex.ee_start_lo && !newex.ee_start_hi) {
3363 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3332 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
3364 /* 3333 /*
@@ -3497,8 +3466,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3497 3466
3498 ext4_ext_mark_uninitialized(ex); 3467 ext4_ext_mark_uninitialized(ex);
3499 3468
3500 err = ext4_ext_remove_space(inode, map->m_lblk, 3469 ext4_ext_invalidate_cache(inode);
3501 map->m_lblk + punched_out); 3470
3471 err = ext4_ext_rm_leaf(handle, inode, path,
3472 map->m_lblk, map->m_lblk + punched_out);
3473
3474 if (!err && path->p_hdr->eh_entries == 0) {
3475 /*
3476 * Punch hole freed all of this sub tree,
3477 * so we need to correct eh_depth
3478 */
3479 err = ext4_ext_get_access(handle, inode, path);
3480 if (err == 0) {
3481 ext_inode_hdr(inode)->eh_depth = 0;
3482 ext_inode_hdr(inode)->eh_max =
3483 cpu_to_le16(ext4_ext_space_root(
3484 inode, 0));
3485
3486 err = ext4_ext_dirty(
3487 handle, inode, path);
3488 }
3489 }
3502 3490
3503 goto out2; 3491 goto out2;
3504 } 3492 }
@@ -3596,17 +3584,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3596 } 3584 }
3597 3585
3598 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); 3586 err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len);
3599 if (err) 3587 if (!err)
3600 goto out2; 3588 err = ext4_ext_insert_extent(handle, inode, path,
3601 3589 &newex, flags);
3602 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3603 if (err) { 3590 if (err) {
3591 int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
3592 EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
3604 /* free data blocks we just allocated */ 3593 /* free data blocks we just allocated */
3605 /* not a good idea to call discard here directly, 3594 /* not a good idea to call discard here directly,
3606 * but otherwise we'd need to call it every free() */ 3595 * but otherwise we'd need to call it every free() */
3607 ext4_discard_preallocations(inode); 3596 ext4_discard_preallocations(inode);
3608 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), 3597 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3609 ext4_ext_get_actual_len(&newex), 0); 3598 ext4_ext_get_actual_len(&newex), fb_flags);
3610 goto out2; 3599 goto out2;
3611 } 3600 }
3612 3601
@@ -3699,7 +3688,7 @@ void ext4_ext_truncate(struct inode *inode)
3699 3688
3700 last_block = (inode->i_size + sb->s_blocksize - 1) 3689 last_block = (inode->i_size + sb->s_blocksize - 1)
3701 >> EXT4_BLOCK_SIZE_BITS(sb); 3690 >> EXT4_BLOCK_SIZE_BITS(sb);
3702 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 3691 err = ext4_ext_remove_space(inode, last_block);
3703 3692
3704 /* In a multi-transaction truncate, we only make the final 3693 /* In a multi-transaction truncate, we only make the final
3705 * transaction synchronous. 3694 * transaction synchronous.
@@ -3835,7 +3824,7 @@ retry:
3835 blkbits) >> blkbits)) 3824 blkbits) >> blkbits))
3836 new_size = offset + len; 3825 new_size = offset + len;
3837 else 3826 else
3838 new_size = (map.m_lblk + ret) << blkbits; 3827 new_size = ((loff_t) map.m_lblk + ret) << blkbits;
3839 3828
3840 ext4_falloc_update_inode(inode, mode, new_size, 3829 ext4_falloc_update_inode(inode, mode, new_size,
3841 (map.m_flags & EXT4_MAP_NEW)); 3830 (map.m_flags & EXT4_MAP_NEW));
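The final hunk is a 32-bit overflow fix: map.m_lblk is a 32-bit logical block number, so the sum has to be widened to loff_t before it is shifted by blkbits. A small demonstration with arbitrary example values (long long stands in for loff_t):

#include <stdio.h>

int main(void)
{
	unsigned int m_lblk = 0x400000;	/* logical block ~4M */
	int ret = 1;
	int blkbits = 12;		/* 4KB blocks */

	/* the shift happens in 32 bits and silently wraps */
	long long wrong = (m_lblk + ret) << blkbits;
	/* widened first, exactly what the (loff_t) cast buys */
	long long right = ((long long)m_lblk + ret) << blkbits;

	printf("wrong=%lld right=%lld\n", wrong, right);
	return 0;
}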
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index da3bed3e0c29..036f78f7a1ef 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -129,15 +129,30 @@ static int ext4_sync_parent(struct inode *inode)
129{ 129{
130 struct writeback_control wbc; 130 struct writeback_control wbc;
131 struct dentry *dentry = NULL; 131 struct dentry *dentry = NULL;
132 struct inode *next;
132 int ret = 0; 133 int ret = 0;
133 134
134 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { 135 if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
136 return 0;
137 inode = igrab(inode);
138 while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
135 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); 139 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
136 dentry = list_entry(inode->i_dentry.next, 140 dentry = NULL;
137 struct dentry, d_alias); 141 spin_lock(&inode->i_lock);
138 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) 142 if (!list_empty(&inode->i_dentry)) {
143 dentry = list_first_entry(&inode->i_dentry,
144 struct dentry, d_alias);
145 dget(dentry);
146 }
147 spin_unlock(&inode->i_lock);
148 if (!dentry)
139 break; 149 break;
140 inode = dentry->d_parent->d_inode; 150 next = igrab(dentry->d_parent->d_inode);
151 dput(dentry);
152 if (!next)
153 break;
154 iput(inode);
155 inode = next;
141 ret = sync_mapping_buffers(inode->i_mapping); 156 ret = sync_mapping_buffers(inode->i_mapping);
142 if (ret) 157 if (ret)
143 break; 158 break;
@@ -148,6 +163,7 @@ static int ext4_sync_parent(struct inode *inode)
148 if (ret) 163 if (ret)
149 break; 164 break;
150 } 165 }
166 iput(inode);
151 return ret; 167 return ret;
152} 168}
153 169
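The rewritten walk pins each parent with igrab() before releasing the child with iput(), and samples i_dentry only under i_lock, so nothing in the chain can be freed mid-traversal. A toy userspace model of just that reference-ordering discipline, where grab()/put() stand in for igrab()/iput() and the dentry hop is elided:

#include <stdio.h>

struct tinode {
	const char *name;
	int refs;
	struct tinode *parent;
	int newentry;	/* models EXT4_STATE_NEWENTRY */
};

static struct tinode *grab(struct tinode *i) { if (i) i->refs++; return i; }
static void put(struct tinode *i) { if (i) i->refs--; }

static void sync_parents(struct tinode *inode)
{
	struct tinode *next;

	inode = grab(inode);
	while (inode->newentry) {
		inode->newentry = 0;
		next = grab(inode->parent);	/* pin the parent first... */
		if (!next)
			break;
		put(inode);			/* ...then drop the child */
		inode = next;
		printf("sync %s\n", inode->name);
	}
	put(inode);
}

int main(void)
{
	struct tinode root = { "/",    1, NULL,  0 };
	struct tinode dir  = { "dir",  1, &root, 1 };
	struct tinode file = { "file", 1, &dir,  1 };

	sync_parents(&file);	/* prints: sync dir, sync root */
	return 0;
}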
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21bb2f61e502..9c63f273b550 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1287,7 +1287,7 @@ extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
1287 group, used_blks, 1287 group, used_blks,
1288 ext4_itable_unused_count(sb, gdp)); 1288 ext4_itable_unused_count(sb, gdp));
1289 ret = 1; 1289 ret = 1;
1290 goto out; 1290 goto err_out;
1291 } 1291 }
1292 1292
1293 blk = ext4_inode_table(sb, gdp) + used_blks; 1293 blk = ext4_inode_table(sb, gdp) + used_blks;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
new file mode 100644
index 000000000000..b8602cde5b5a
--- /dev/null
+++ b/fs/ext4/indirect.c
@@ -0,0 +1,1482 @@
1/*
2 * linux/fs/ext4/indirect.c
3 *
4 * from
5 *
6 * linux/fs/ext4/inode.c
7 *
8 * Copyright (C) 1992, 1993, 1994, 1995
9 * Remy Card (card@masi.ibp.fr)
10 * Laboratoire MASI - Institut Blaise Pascal
11 * Universite Pierre et Marie Curie (Paris VI)
12 *
13 * from
14 *
15 * linux/fs/minix/inode.c
16 *
17 * Copyright (C) 1991, 1992 Linus Torvalds
18 *
19 * Goal-directed block allocation by Stephen Tweedie
20 * (sct@redhat.com), 1993, 1998
21 */
22
23#include <linux/module.h>
24#include "ext4_jbd2.h"
25#include "truncate.h"
26
27#include <trace/events/ext4.h>
28
29typedef struct {
30 __le32 *p;
31 __le32 key;
32 struct buffer_head *bh;
33} Indirect;
34
35static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
36{
37 p->key = *(p->p = v);
38 p->bh = bh;
39}
40
41/**
42 * ext4_block_to_path - parse the block number into array of offsets
43 * @inode: inode in question (we are only interested in its superblock)
44 * @i_block: block number to be parsed
45 * @offsets: array to store the offsets in
46 * @boundary: set this non-zero if the referred-to block is likely to be
47 * followed (on disk) by an indirect block.
48 *
49 * To store the locations of a file's data, ext4 uses a data structure common
 50 * to UNIX filesystems - a tree of pointers anchored in the inode, with
51 * data blocks at leaves and indirect blocks in intermediate nodes.
52 * This function translates the block number into path in that tree -
53 * return value is the path length and @offsets[n] is the offset of
54 * pointer to (n+1)th node in the nth one. If @block is out of range
55 * (negative or too large) warning is printed and zero returned.
56 *
57 * Note: function doesn't find node addresses, so no IO is needed. All
58 * we need to know is the capacity of indirect blocks (taken from the
59 * inode->i_sb).
60 */
61
62/*
63 * Portability note: the last comparison (check that we fit into triple
64 * indirect block) is spelled differently, because otherwise on an
65 * architecture with 32-bit longs and 8Kb pages we might get into trouble
66 * if our filesystem had 8Kb blocks. We might use long long, but that would
67 * kill us on x86. Oh, well, at least the sign propagation does not matter -
68 * i_block would have to be negative in the very beginning, so we would not
69 * get there at all.
70 */
71
72static int ext4_block_to_path(struct inode *inode,
73 ext4_lblk_t i_block,
74 ext4_lblk_t offsets[4], int *boundary)
75{
76 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
77 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
78 const long direct_blocks = EXT4_NDIR_BLOCKS,
79 indirect_blocks = ptrs,
80 double_blocks = (1 << (ptrs_bits * 2));
81 int n = 0;
82 int final = 0;
83
84 if (i_block < direct_blocks) {
85 offsets[n++] = i_block;
86 final = direct_blocks;
87 } else if ((i_block -= direct_blocks) < indirect_blocks) {
88 offsets[n++] = EXT4_IND_BLOCK;
89 offsets[n++] = i_block;
90 final = ptrs;
91 } else if ((i_block -= indirect_blocks) < double_blocks) {
92 offsets[n++] = EXT4_DIND_BLOCK;
93 offsets[n++] = i_block >> ptrs_bits;
94 offsets[n++] = i_block & (ptrs - 1);
95 final = ptrs;
96 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
97 offsets[n++] = EXT4_TIND_BLOCK;
98 offsets[n++] = i_block >> (ptrs_bits * 2);
99 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
100 offsets[n++] = i_block & (ptrs - 1);
101 final = ptrs;
102 } else {
103 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
104 i_block + direct_blocks +
105 indirect_blocks + double_blocks, inode->i_ino);
106 }
107 if (boundary)
108 *boundary = final - 1 - (i_block & (ptrs - 1));
109 return n;
110}
111
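ext4_block_to_path() is pure arithmetic and can be exercised outside the kernel. A compressed sketch assuming a 4KB block size (1024 pointers per indirect block); the constants 12/13/14 mirror EXT4_IND_BLOCK, EXT4_DIND_BLOCK and EXT4_TIND_BLOCK:

#include <stdio.h>

#define NDIR		12	/* EXT4_NDIR_BLOCKS */
#define PTRS		1024	/* assumed: 4KB block / 4-byte pointers */
#define PTRS_BITS	10

static int block_to_path(unsigned long i_block, unsigned long offsets[4])
{
	int n = 0;

	if (i_block < NDIR) {
		offsets[n++] = i_block;
	} else if ((i_block -= NDIR) < PTRS) {
		offsets[n++] = 12;
		offsets[n++] = i_block;
	} else if ((i_block -= PTRS) < (1UL << (PTRS_BITS * 2))) {
		offsets[n++] = 13;
		offsets[n++] = i_block >> PTRS_BITS;
		offsets[n++] = i_block & (PTRS - 1);
	} else if (((i_block -= 1UL << (PTRS_BITS * 2)) >> (PTRS_BITS * 2)) < PTRS) {
		offsets[n++] = 14;
		offsets[n++] = i_block >> (PTRS_BITS * 2);
		offsets[n++] = (i_block >> PTRS_BITS) & (PTRS - 1);
		offsets[n++] = i_block & (PTRS - 1);
	}
	return n;	/* 0 means out of range */
}

int main(void)
{
	unsigned long off[4];
	int depth = block_to_path(12 + 1024 + 5, off);

	for (int i = 0; i < depth; i++)
		printf("offsets[%d] = %lu\n", i, off[i]);	/* 13, 0, 5 */
	return 0;
}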
112/**
113 * ext4_get_branch - read the chain of indirect blocks leading to data
114 * @inode: inode in question
115 * @depth: depth of the chain (1 - direct pointer, etc.)
116 * @offsets: offsets of pointers in inode/indirect blocks
117 * @chain: place to store the result
118 * @err: here we store the error value
119 *
120 * Function fills the array of triples <key, p, bh> and returns %NULL
121 * if everything went OK or the pointer to the last filled triple
122 * (incomplete one) otherwise. Upon the return chain[i].key contains
123 * the number of (i+1)-th block in the chain (as it is stored in memory,
124 * i.e. little-endian 32-bit), chain[i].p contains the address of that
125 * number (it points into struct inode for i==0 and into the bh->b_data
126 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
127 * block for i>0 and NULL for i==0. In other words, it holds the block
128 * numbers of the chain, addresses they were taken from (and where we can
129 * verify that chain did not change) and buffer_heads hosting these
130 * numbers.
131 *
132 * Function stops when it stumbles upon zero pointer (absent block)
133 * (pointer to last triple returned, *@err == 0)
134 * or when it gets an IO error reading an indirect block
135 * (ditto, *@err == -EIO)
136 * or when it reads all @depth-1 indirect blocks successfully and finds
137 * the whole chain, all way to the data (returns %NULL, *err == 0).
138 *
139 * Need to be called with
140 * down_read(&EXT4_I(inode)->i_data_sem)
141 */
142static Indirect *ext4_get_branch(struct inode *inode, int depth,
143 ext4_lblk_t *offsets,
144 Indirect chain[4], int *err)
145{
146 struct super_block *sb = inode->i_sb;
147 Indirect *p = chain;
148 struct buffer_head *bh;
149
150 *err = 0;
151 /* i_data is not going away, no lock needed */
152 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
153 if (!p->key)
154 goto no_block;
155 while (--depth) {
156 bh = sb_getblk(sb, le32_to_cpu(p->key));
157 if (unlikely(!bh))
158 goto failure;
159
160 if (!bh_uptodate_or_lock(bh)) {
161 if (bh_submit_read(bh) < 0) {
162 put_bh(bh);
163 goto failure;
164 }
165 /* validate block references */
166 if (ext4_check_indirect_blockref(inode, bh)) {
167 put_bh(bh);
168 goto failure;
169 }
170 }
171
172 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
173 /* Reader: end */
174 if (!p->key)
175 goto no_block;
176 }
177 return NULL;
178
179failure:
180 *err = -EIO;
181no_block:
182 return p;
183}
184
185/**
186 * ext4_find_near - find a place for allocation with sufficient locality
187 * @inode: owner
188 * @ind: descriptor of indirect block.
189 *
190 * This function returns the preferred place for block allocation.
191 * It is used when heuristic for sequential allocation fails.
192 * Rules are:
193 * + if there is a block to the left of our position - allocate near it.
194 * + if pointer will live in indirect block - allocate near that block.
195 * + if pointer will live in inode - allocate in the same
196 * cylinder group.
197 *
198 * In the latter case we colour the starting block by the caller's PID to
199 * prevent it from clashing with concurrent allocations for a different inode
200 * in the same block group. The PID is used here so that functionally related
201 * files will be close-by on-disk.
202 *
203 * Caller must make sure that @ind is valid and will stay that way.
204 */
205static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
206{
207 struct ext4_inode_info *ei = EXT4_I(inode);
208 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
209 __le32 *p;
210
211 /* Try to find previous block */
212 for (p = ind->p - 1; p >= start; p--) {
213 if (*p)
214 return le32_to_cpu(*p);
215 }
216
217 /* No such thing, so let's try location of indirect block */
218 if (ind->bh)
219 return ind->bh->b_blocknr;
220
221 /*
222 * It is going to be referred to from the inode itself? OK, just put it
223 * into the same cylinder group then.
224 */
225 return ext4_inode_to_goal_block(inode);
226}
227
228/**
229 * ext4_find_goal - find a preferred place for allocation.
230 * @inode: owner
231 * @block: block we want
232 * @partial: pointer to the last triple within a chain
233 *
234 * Normally this function finds the preferred place for block allocation
 235 * and returns it.
236 * Because this is only used for non-extent files, we limit the block nr
237 * to 32 bits.
238 */
239static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
240 Indirect *partial)
241{
242 ext4_fsblk_t goal;
243
244 /*
245 * XXX need to get goal block from mballoc's data structures
246 */
247
248 goal = ext4_find_near(inode, partial);
249 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
250 return goal;
251}
252
253/**
254 * ext4_blks_to_allocate - Look up the block map and count the number
255 * of direct blocks that need to be allocated for the given branch.
256 *
257 * @branch: chain of indirect blocks
258 * @k: number of blocks need for indirect blocks
259 * @blks: number of data blocks to be mapped.
260 * @blocks_to_boundary: the offset in the indirect block
261 *
262 * return the total number of blocks to be allocated, including the
263 * direct and indirect blocks.
264 */
265static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
266 int blocks_to_boundary)
267{
268 unsigned int count = 0;
269
270 /*
271 * Simple case: the [t,d]indirect block(s) have not been allocated yet,
 272 * so it's clear the blocks on that path have not been allocated either
273 */
274 if (k > 0) {
275 /* right now we don't handle cross boundary allocation */
276 if (blks < blocks_to_boundary + 1)
277 count += blks;
278 else
279 count += blocks_to_boundary + 1;
280 return count;
281 }
282
283 count++;
284 while (count < blks && count <= blocks_to_boundary &&
285 le32_to_cpu(*(branch[0].p + count)) == 0) {
286 count++;
287 }
288 return count;
289}
290
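The k == 0 branch above just counts how many of the wanted slots are still unmapped before the first in-use block or the indirect-block boundary. Isolated as a runnable sketch (host-endian values assumed in place of __le32):

#include <stdint.h>
#include <stdio.h>

static unsigned int count_allocatable(const uint32_t *slots, unsigned int blks,
				      unsigned int blocks_to_boundary)
{
	unsigned int count = 1;	/* the first direct block is always needed */

	while (count < blks && count <= blocks_to_boundary &&
	       slots[count] == 0)
		count++;
	return count;
}

int main(void)
{
	uint32_t slots[] = { 0, 0, 0, 777, 0 };

	/* stops at the mapped slot: prints 3 */
	printf("%u\n", count_allocatable(slots, 5, 4));
	return 0;
}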
291/**
292 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
293 * @handle: handle for this transaction
294 * @inode: inode which needs allocated blocks
295 * @iblock: the logical block to start allocating at
296 * @goal: preferred physical block of allocation
297 * @indirect_blks: the number of blocks need to allocate for indirect
298 * blocks
299 * @blks: number of desired blocks
300 * @new_blocks: on return it will store the new block numbers for
302 * the indirect blocks (if needed) and the first direct block,
302 * @err: on return it will store the error code
303 *
304 * This function will return the number of blocks allocated as
305 * requested by the passed-in parameters.
306 */
307static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
308 ext4_lblk_t iblock, ext4_fsblk_t goal,
309 int indirect_blks, int blks,
310 ext4_fsblk_t new_blocks[4], int *err)
311{
312 struct ext4_allocation_request ar;
313 int target, i;
314 unsigned long count = 0, blk_allocated = 0;
315 int index = 0;
316 ext4_fsblk_t current_block = 0;
317 int ret = 0;
318
319 /*
320 * Here we try to allocate the requested multiple blocks at once,
321 * on a best-effort basis.
322 * To build a branch, we should allocate blocks for
 323 * the indirect blocks (if not allocated yet), and at least
 324 * the first direct block of this branch. That's the
 325 * minimum number of blocks we need to allocate (required)
326 */
327 /* first we try to allocate the indirect blocks */
328 target = indirect_blks;
329 while (target > 0) {
330 count = target;
331 /* allocating blocks for indirect blocks and direct blocks */
332 current_block = ext4_new_meta_blocks(handle, inode, goal,
333 0, &count, err);
334 if (*err)
335 goto failed_out;
336
337 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
338 EXT4_ERROR_INODE(inode,
339 "current_block %llu + count %lu > %d!",
340 current_block, count,
341 EXT4_MAX_BLOCK_FILE_PHYS);
342 *err = -EIO;
343 goto failed_out;
344 }
345
346 target -= count;
347 /* allocate blocks for indirect blocks */
348 while (index < indirect_blks && count) {
349 new_blocks[index++] = current_block++;
350 count--;
351 }
352 if (count > 0) {
353 /*
354 * save the new block number
355 * for the first direct block
356 */
357 new_blocks[index] = current_block;
358 printk(KERN_INFO "%s returned more blocks than "
359 "requested\n", __func__);
360 WARN_ON(1);
361 break;
362 }
363 }
364
365	target = blks - count;
366 blk_allocated = count;
367 if (!target)
368 goto allocated;
369 /* Now allocate data blocks */
370 memset(&ar, 0, sizeof(ar));
371 ar.inode = inode;
372 ar.goal = goal;
373 ar.len = target;
374 ar.logical = iblock;
375 if (S_ISREG(inode->i_mode))
376 /* enable in-core preallocation only for regular files */
377 ar.flags = EXT4_MB_HINT_DATA;
378
379 current_block = ext4_mb_new_blocks(handle, &ar, err);
380 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
381 EXT4_ERROR_INODE(inode,
382 "current_block %llu + ar.len %d > %d!",
383 current_block, ar.len,
384 EXT4_MAX_BLOCK_FILE_PHYS);
385 *err = -EIO;
386 goto failed_out;
387 }
388
389 if (*err && (target == blks)) {
390 /*
391 * if the allocation failed and we didn't allocate
392 * any blocks before
393 */
394 goto failed_out;
395 }
396 if (!*err) {
397 if (target == blks) {
398 /*
399 * save the new block number
400 * for the first direct block
401 */
402 new_blocks[index] = current_block;
403 }
404 blk_allocated += ar.len;
405 }
406allocated:
407 /* total number of blocks allocated for direct blocks */
408 ret = blk_allocated;
409 *err = 0;
410 return ret;
411failed_out:
412 for (i = 0; i < index; i++)
413 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
414 return ret;
415}
416
417/**
418 * ext4_alloc_branch - allocate and set up a chain of blocks.
419 * @handle: handle for this transaction
420 * @inode: owner
421 * @indirect_blks: number of allocated indirect blocks
422 * @blks: number of allocated direct blocks
423 * @goal: preferred place for allocation
424 * @offsets: offsets (in the blocks) to store the pointers to next.
425 * @branch: place to store the chain in.
426 *
427 * This function allocates blocks, zeroes out all but the last one,
428 * links them into chain and (if we are synchronous) writes them to disk.
429 * In other words, it prepares a branch that can be spliced onto the
430 * inode. It stores the information about that chain in the branch[], in
431 * the same format as ext4_get_branch() would do. We are calling it after
432 * we had read the existing part of chain and partial points to the last
433 * triple of that (one with zero ->key). Upon the exit we have the same
434 * picture as after the successful ext4_get_block(), except that in one
435 * place chain is disconnected - *branch->p is still zero (we did not
436 * set the last link), but branch->key contains the number that should
437 * be placed into *branch->p to fill that gap.
438 *
439 * If allocation fails we free all blocks we've allocated (and forget
440 * their buffer_heads) and return the error value the from failed
441 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
442 * as described above and return 0.
443 */
444static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
445 ext4_lblk_t iblock, int indirect_blks,
446 int *blks, ext4_fsblk_t goal,
447 ext4_lblk_t *offsets, Indirect *branch)
448{
449 int blocksize = inode->i_sb->s_blocksize;
450 int i, n = 0;
451 int err = 0;
452 struct buffer_head *bh;
453 int num;
454 ext4_fsblk_t new_blocks[4];
455 ext4_fsblk_t current_block;
456
457 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
458 *blks, new_blocks, &err);
459 if (err)
460 return err;
461
462 branch[0].key = cpu_to_le32(new_blocks[0]);
463 /*
464 * metadata blocks and data blocks are allocated.
465 */
466 for (n = 1; n <= indirect_blks; n++) {
467 /*
468 * Get buffer_head for parent block, zero it out
469 * and set the pointer to new one, then send
470 * parent to disk.
471 */
472 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
473 if (unlikely(!bh)) {
474 err = -EIO;
475 goto failed;
476 }
477
478 branch[n].bh = bh;
479 lock_buffer(bh);
480 BUFFER_TRACE(bh, "call get_create_access");
481 err = ext4_journal_get_create_access(handle, bh);
482 if (err) {
483 /* Don't brelse(bh) here; it's done in
484 * ext4_journal_forget() below */
485 unlock_buffer(bh);
486 goto failed;
487 }
488
489 memset(bh->b_data, 0, blocksize);
490 branch[n].p = (__le32 *) bh->b_data + offsets[n];
491 branch[n].key = cpu_to_le32(new_blocks[n]);
492 *branch[n].p = branch[n].key;
493 if (n == indirect_blks) {
494 current_block = new_blocks[n];
495 /*
496 * End of chain, update the last new metablock of
497 * the chain to point to the new allocated
498 * data blocks numbers
499 */
500 for (i = 1; i < num; i++)
501 *(branch[n].p + i) = cpu_to_le32(++current_block);
502 }
503 BUFFER_TRACE(bh, "marking uptodate");
504 set_buffer_uptodate(bh);
505 unlock_buffer(bh);
506
507 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
508 err = ext4_handle_dirty_metadata(handle, inode, bh);
509 if (err)
510 goto failed;
511 }
512 *blks = num;
513 return err;
514failed:
515 /* Allocation failed, free what we already allocated */
516 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
517 for (i = 1; i <= n ; i++) {
518 /*
519 * branch[i].bh is newly allocated, so there is no
520 * need to revoke the block, which is why we don't
521 * need to set EXT4_FREE_BLOCKS_METADATA.
522 */
523 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
524 EXT4_FREE_BLOCKS_FORGET);
525 }
526 for (i = n+1; i < indirect_blks; i++)
527 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
528
529 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
530
531 return err;
532}
533
534/**
535 * ext4_splice_branch - splice the allocated branch onto inode.
536 * @handle: handle for this transaction
537 * @inode: owner
538 * @block: (logical) number of block we are adding
539 * @chain: chain of indirect blocks (with a missing link - see
540 * ext4_alloc_branch)
541 * @where: location of missing link
542 * @num: number of indirect blocks we are adding
543 * @blks: number of direct blocks we are adding
544 *
545 * This function fills the missing link and does all housekeeping needed in
546 * inode (->i_blocks, etc.). In case of success we end up with the full
547 * chain to new block and return 0.
548 */
549static int ext4_splice_branch(handle_t *handle, struct inode *inode,
550 ext4_lblk_t block, Indirect *where, int num,
551 int blks)
552{
553 int i;
554 int err = 0;
555 ext4_fsblk_t current_block;
556
557 /*
558 * If we're splicing into a [td]indirect block (as opposed to the
559 * inode) then we need to get write access to the [td]indirect block
560 * before the splice.
561 */
562 if (where->bh) {
563 BUFFER_TRACE(where->bh, "get_write_access");
564 err = ext4_journal_get_write_access(handle, where->bh);
565 if (err)
566 goto err_out;
567 }
568 /* That's it */
569
570 *where->p = where->key;
571
572 /*
573 * Update the host buffer_head or inode to point to the just-allocated
 574 * direct blocks
575 */
576 if (num == 0 && blks > 1) {
577 current_block = le32_to_cpu(where->key) + 1;
578 for (i = 1; i < blks; i++)
579 *(where->p + i) = cpu_to_le32(current_block++);
580 }
581
582 /* We are done with atomic stuff, now do the rest of housekeeping */
583 /* had we spliced it onto indirect block? */
584 if (where->bh) {
585 /*
586 * If we spliced it onto an indirect block, we haven't
587 * altered the inode. Note however that if it is being spliced
588 * onto an indirect block at the very end of the file (the
589 * file is growing) then we *will* alter the inode to reflect
590 * the new i_size. But that is not done here - it is done in
591 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
592 */
593 jbd_debug(5, "splicing indirect only\n");
594 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
595 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
596 if (err)
597 goto err_out;
598 } else {
599 /*
600 * OK, we spliced it into the inode itself on a direct block.
601 */
602 ext4_mark_inode_dirty(handle, inode);
603 jbd_debug(5, "splicing direct\n");
604 }
605 return err;
606
607err_out:
608 for (i = 1; i <= num; i++) {
609 /*
610 * branch[i].bh is newly allocated, so there is no
611 * need to revoke the block, which is why we don't
612 * need to set EXT4_FREE_BLOCKS_METADATA.
613 */
614 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
615 EXT4_FREE_BLOCKS_FORGET);
616 }
617 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
618 blks, 0);
619
620 return err;
621}
622
623/*
624 * The ext4_ind_map_blocks() function handles non-extents inodes
625 * (i.e., using the traditional indirect/double-indirect i_blocks
626 * scheme) for ext4_map_blocks().
627 *
628 * Allocation strategy is simple: if we have to allocate something, we will
629 * have to go the whole way to leaf. So let's do it before attaching anything
630 * to tree, set linkage between the newborn blocks, write them if sync is
631 * required, recheck the path, free and repeat if check fails, otherwise
632 * set the last missing link (that will protect us from any truncate-generated
633 * removals - all blocks on the path are immune now) and possibly force the
634 * write on the parent block.
635 * That has a nice additional property: no special recovery from the failed
636 * allocations is needed - we simply release blocks and do not touch anything
637 * reachable from inode.
638 *
639 * `handle' can be NULL if create == 0.
640 *
641 * return > 0, # of blocks mapped or allocated.
642 * return = 0, if plain lookup failed.
643 * return < 0, error case.
644 *
645 * The ext4_ind_map_blocks() function should be called with
646 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
647 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
648 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
649 * blocks.
650 */
651int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
652 struct ext4_map_blocks *map,
653 int flags)
654{
655 int err = -EIO;
656 ext4_lblk_t offsets[4];
657 Indirect chain[4];
658 Indirect *partial;
659 ext4_fsblk_t goal;
660 int indirect_blks;
661 int blocks_to_boundary = 0;
662 int depth;
663 int count = 0;
664 ext4_fsblk_t first_block = 0;
665
666 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
667 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
668 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
669 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
670 &blocks_to_boundary);
671
672 if (depth == 0)
673 goto out;
674
675 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
676
677 /* Simplest case - block found, no allocation needed */
678 if (!partial) {
679 first_block = le32_to_cpu(chain[depth - 1].key);
680 count++;
681	/* map more blocks */
682 while (count < map->m_len && count <= blocks_to_boundary) {
683 ext4_fsblk_t blk;
684
685 blk = le32_to_cpu(*(chain[depth-1].p + count));
686
687 if (blk == first_block + count)
688 count++;
689 else
690 break;
691 }
692 goto got_it;
693 }
694
695 /* Next simple case - plain lookup or failed read of indirect block */
696 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
697 goto cleanup;
698
699 /*
700 * Okay, we need to do block allocation.
701 */
702 goal = ext4_find_goal(inode, map->m_lblk, partial);
703
704	/* the number of blocks needed for the [d,t]indirect blocks */
705 indirect_blks = (chain + depth) - partial - 1;
706
707 /*
708	 * Next look up the indirect map to count the total number of
709 * direct blocks to allocate for this branch.
710 */
711 count = ext4_blks_to_allocate(partial, indirect_blks,
712 map->m_len, blocks_to_boundary);
713 /*
714 * Block out ext4_truncate while we alter the tree
715 */
716 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
717 &count, goal,
718 offsets + (partial - chain), partial);
719
720 /*
721 * The ext4_splice_branch call will free and forget any buffers
722 * on the new chain if there is a failure, but that risks using
723 * up transaction credits, especially for bitmaps where the
724 * credits cannot be returned. Can we handle this somehow? We
725 * may need to return -EAGAIN upwards in the worst case. --sct
726 */
727 if (!err)
728 err = ext4_splice_branch(handle, inode, map->m_lblk,
729 partial, indirect_blks, count);
730 if (err)
731 goto cleanup;
732
733 map->m_flags |= EXT4_MAP_NEW;
734
735 ext4_update_inode_fsync_trans(handle, inode, 1);
736got_it:
737 map->m_flags |= EXT4_MAP_MAPPED;
738 map->m_pblk = le32_to_cpu(chain[depth-1].key);
739 map->m_len = count;
740 if (count > blocks_to_boundary)
741 map->m_flags |= EXT4_MAP_BOUNDARY;
742 err = count;
743 /* Clean up and exit */
744 partial = chain + depth - 1; /* the whole chain */
745cleanup:
746 while (partial > chain) {
747 BUFFER_TRACE(partial->bh, "call brelse");
748 brelse(partial->bh);
749 partial--;
750 }
751out:
752 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
753 map->m_pblk, map->m_len, err);
754 return err;
755}
756
757/*
758 * O_DIRECT for ext3 (or indirect map) based files
759 *
760 * If the O_DIRECT write will extend the file then add this inode to the
761 * orphan list. So recovery will truncate it back to the original size
762 * if the machine crashes during the write.
763 *
764 * If the O_DIRECT write is instantiating holes inside i_size and the machine
 765 * crashes then stale disk data _may_ be exposed inside the file. But current
 766 * VFS code falls back to the buffered path in that case so we are safe.
767 */
768ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
769 const struct iovec *iov, loff_t offset,
770 unsigned long nr_segs)
771{
772 struct file *file = iocb->ki_filp;
773 struct inode *inode = file->f_mapping->host;
774 struct ext4_inode_info *ei = EXT4_I(inode);
775 handle_t *handle;
776 ssize_t ret;
777 int orphan = 0;
778 size_t count = iov_length(iov, nr_segs);
779 int retries = 0;
780
781 if (rw == WRITE) {
782 loff_t final_size = offset + count;
783
784 if (final_size > inode->i_size) {
785 /* Credits for sb + inode write */
786 handle = ext4_journal_start(inode, 2);
787 if (IS_ERR(handle)) {
788 ret = PTR_ERR(handle);
789 goto out;
790 }
791 ret = ext4_orphan_add(handle, inode);
792 if (ret) {
793 ext4_journal_stop(handle);
794 goto out;
795 }
796 orphan = 1;
797 ei->i_disksize = inode->i_size;
798 ext4_journal_stop(handle);
799 }
800 }
801
802retry:
803 if (rw == READ && ext4_should_dioread_nolock(inode))
804 ret = __blockdev_direct_IO(rw, iocb, inode,
805 inode->i_sb->s_bdev, iov,
806 offset, nr_segs,
807 ext4_get_block, NULL, NULL, 0);
808 else {
809 ret = blockdev_direct_IO(rw, iocb, inode, iov,
810 offset, nr_segs, ext4_get_block);
811
812 if (unlikely((rw & WRITE) && ret < 0)) {
813 loff_t isize = i_size_read(inode);
814 loff_t end = offset + iov_length(iov, nr_segs);
815
816 if (end > isize)
817 ext4_truncate_failed_write(inode);
818 }
819 }
820 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
821 goto retry;
822
823 if (orphan) {
824 int err;
825
826 /* Credits for sb + inode write */
827 handle = ext4_journal_start(inode, 2);
828 if (IS_ERR(handle)) {
829 /* This is really bad luck. We've written the data
830 * but cannot extend i_size. Bail out and pretend
831 * the write failed... */
832 ret = PTR_ERR(handle);
833 if (inode->i_nlink)
834 ext4_orphan_del(NULL, inode);
835
836 goto out;
837 }
838 if (inode->i_nlink)
839 ext4_orphan_del(handle, inode);
840 if (ret > 0) {
841 loff_t end = offset + ret;
842 if (end > inode->i_size) {
843 ei->i_disksize = end;
844 i_size_write(inode, end);
845 /*
846 * We're going to return a positive `ret'
847 * here due to non-zero-length I/O, so there's
848 * no way of reporting error returns from
849 * ext4_mark_inode_dirty() to userspace. So
850 * ignore it.
851 */
852 ext4_mark_inode_dirty(handle, inode);
853 }
854 }
855 err = ext4_journal_stop(handle);
856 if (ret == 0)
857 ret = err;
858 }
859out:
860 return ret;
861}
862
863/*
864 * Calculate the number of metadata blocks that need to be reserved
 865 * to allocate a new block at @lblock for a non-extent-based file
866 */
867int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
868{
869 struct ext4_inode_info *ei = EXT4_I(inode);
870 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
871 int blk_bits;
872
873 if (lblock < EXT4_NDIR_BLOCKS)
874 return 0;
875
876 lblock -= EXT4_NDIR_BLOCKS;
877
878 if (ei->i_da_metadata_calc_len &&
879 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
880 ei->i_da_metadata_calc_len++;
881 return 0;
882 }
883 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
884 ei->i_da_metadata_calc_len = 1;
885 blk_bits = order_base_2(lblock);
886 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
887}
888
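The reservation estimate boils down to order_base_2(lblock) divided by the pointer bits per block, plus one. A worked example assuming a 4KB block size, with order_base_2() reimplemented the naive way:

#include <stdio.h>

#define ADDR_PER_BLOCK_BITS 10	/* assumed: 1024 pointers per block */

static int order_base_2(unsigned long n)
{
	int bits = 0;

	while ((1UL << bits) < n)
		bits++;
	return bits;	/* ceil(log2(n)); order_base_2(1) == 0 */
}

int main(void)
{
	/* lblock here already has EXT4_NDIR_BLOCKS subtracted */
	unsigned long samples[] = { 1, 1000, 2000, 2000000 };

	for (int i = 0; i < 4; i++)
		printf("lblock %-8lu -> %d metadata block(s)\n", samples[i],
		       order_base_2(samples[i]) / ADDR_PER_BLOCK_BITS + 1);
	return 0;
}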
889int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk)
890{
891 int indirects;
892
893 /* if nrblocks are contiguous */
894 if (chunk) {
895 /*
896 * With N contiguous data blocks, we need at most
897 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
898 * 2 dindirect blocks, and 1 tindirect block
899 */
900 return DIV_ROUND_UP(nrblocks,
901 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
902 }
903 /*
904	 * if nrblocks are not contiguous, worst case: each block touches
 905	 * an indirect block, and each indirect block touches a double indirect
 906	 * block, plus a triple indirect block
907 */
908 indirects = nrblocks * 2 + 1;
909 return indirects;
910}
911
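Plugging an assumed 4KB block size into the two formulas shows how far apart the contiguous and scattered credit estimates are:

#include <stdio.h>

#define ADDR_PER_BLOCK	1024	/* assumed 4KB block size */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int nrblocks = 2048;

	/* N/1024 indirects (+1 for misalignment), 2 dinds, 1 tind: "+ 4" */
	printf("contiguous: %d\n",
	       DIV_ROUND_UP(nrblocks, ADDR_PER_BLOCK) + 4);	/* 6 */

	/* every block may touch its own indirect and dindirect block */
	printf("scattered:  %d\n", nrblocks * 2 + 1);		/* 4097 */
	return 0;
}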
912/*
913 * Truncate transactions can be complex and absolutely huge. So we need to
914 * be able to restart the transaction at a convenient checkpoint to make
915 * sure we don't overflow the journal.
916 *
917 * start_transaction gets us a new handle for a truncate transaction,
918 * and extend_transaction tries to extend the existing one a bit. If
919 * extend fails, we need to propagate the failure up and restart the
920 * transaction in the top-level truncate loop. --sct
921 */
922static handle_t *start_transaction(struct inode *inode)
923{
924 handle_t *result;
925
926 result = ext4_journal_start(inode, ext4_blocks_for_truncate(inode));
927 if (!IS_ERR(result))
928 return result;
929
930 ext4_std_error(inode->i_sb, PTR_ERR(result));
931 return result;
932}
933
934/*
935 * Try to extend this transaction for the purposes of truncation.
936 *
937 * Returns 0 if we managed to create more room. If we can't create more
938 * room, and the transaction must be restarted we return 1.
939 */
940static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
941{
942 if (!ext4_handle_valid(handle))
943 return 0;
944 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
945 return 0;
946 if (!ext4_journal_extend(handle, ext4_blocks_for_truncate(inode)))
947 return 0;
948 return 1;
949}
950
951/*
952 * Probably it should be a library function... search for first non-zero word
953 * or memcmp with zero_page, whatever is better for particular architecture.
954 * Linus?
955 */
956static inline int all_zeroes(__le32 *p, __le32 *q)
957{
958 while (p < q)
959 if (*p++)
960 return 0;
961 return 1;
962}
963
964/**
965 * ext4_find_shared - find the indirect blocks for partial truncation.
966 * @inode: inode in question
967 * @depth: depth of the affected branch
968 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
969 * @chain: place to store the pointers to partial indirect blocks
970 * @top: place to store the (detached) top of the branch
971 *
972 * This is a helper function used by ext4_truncate().
973 *
974 * When we do truncate() we may have to clean the ends of several
975 * indirect blocks but leave the blocks themselves alive. Block is
976 * partially truncated if some data below the new i_size is referred
977 * from it (and it is on the path to the first completely truncated
978 * data block, indeed). We have to free the top of that path along
979 * with everything to the right of the path. Since no allocation
980 * past the truncation point is possible until ext4_truncate()
981 * finishes, we may safely do the latter, but top of branch may
982 * require special attention - pageout below the truncation point
983 * might try to populate it.
984 *
985 * We atomically detach the top of branch from the tree, store the
986 * block number of its root in *@top, pointers to buffer_heads of
987 * partially truncated blocks - in @chain[].bh and pointers to
988 * their last elements that should not be removed - in
989 * @chain[].p. Return value is the pointer to last filled element
990 * of @chain.
991 *
992 * The work left to the caller is the actual freeing of subtrees:
993 * a) free the subtree starting from *@top
994 * b) free the subtrees whose roots are stored in
995 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
996 * c) free the subtrees growing from the inode past the @chain[0].
997 * (no partially truncated stuff there). */
998
999static Indirect *ext4_find_shared(struct inode *inode, int depth,
1000 ext4_lblk_t offsets[4], Indirect chain[4],
1001 __le32 *top)
1002{
1003 Indirect *partial, *p;
1004 int k, err;
1005
1006 *top = 0;
1007 /* Make k index the deepest non-null offset + 1 */
1008 for (k = depth; k > 1 && !offsets[k-1]; k--)
1009 ;
1010 partial = ext4_get_branch(inode, k, offsets, chain, &err);
1011 /* Writer: pointers */
1012 if (!partial)
1013 partial = chain + k-1;
1014 /*
1015 * If the branch acquired continuation since we've looked at it -
1016 * fine, it should all survive and (new) top doesn't belong to us.
1017 */
1018 if (!partial->key && *partial->p)
1019 /* Writer: end */
1020 goto no_top;
1021 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
1022 ;
1023 /*
1024 * OK, we've found the last block that must survive. The rest of our
1025 * branch should be detached before unlocking. However, if that rest
1026 * of branch is all ours and does not grow immediately from the inode
1027 * it's easier to cheat and just decrement partial->p.
1028 */
1029 if (p == chain + k - 1 && p > chain) {
1030 p->p--;
1031 } else {
1032 *top = *p->p;
1033 /* Nope, don't do this in ext4. Must leave the tree intact */
1034#if 0
1035 *p->p = 0;
1036#endif
1037 }
1038 /* Writer: end */
1039
1040 while (partial > p) {
1041 brelse(partial->bh);
1042 partial--;
1043 }
1044no_top:
1045 return partial;
1046}
1047
1048/*
1049 * Zero a number of block pointers in either an inode or an indirect block.
1050 * If we restart the transaction we must again get write access to the
1051 * indirect block for further modification.
1052 *
1053 * We release `count' blocks on disk, but (last - first) may be greater
1054 * than `count' because there can be holes in there.
1055 *
1056 * Return 0 on success, 1 on invalid block range
1057 * and < 0 on fatal error.
1058 */
1059static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
1060 struct buffer_head *bh,
1061 ext4_fsblk_t block_to_free,
1062 unsigned long count, __le32 *first,
1063 __le32 *last)
1064{
1065 __le32 *p;
1066 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
1067 int err;
1068
1069 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
1070 flags |= EXT4_FREE_BLOCKS_METADATA;
1071
1072 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
1073 count)) {
1074 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
1075 "blocks %llu len %lu",
1076 (unsigned long long) block_to_free, count);
1077 return 1;
1078 }
1079
1080 if (try_to_extend_transaction(handle, inode)) {
1081 if (bh) {
1082 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1083 err = ext4_handle_dirty_metadata(handle, inode, bh);
1084 if (unlikely(err))
1085 goto out_err;
1086 }
1087 err = ext4_mark_inode_dirty(handle, inode);
1088 if (unlikely(err))
1089 goto out_err;
1090 err = ext4_truncate_restart_trans(handle, inode,
1091 ext4_blocks_for_truncate(inode));
1092 if (unlikely(err))
1093 goto out_err;
1094 if (bh) {
1095 BUFFER_TRACE(bh, "retaking write access");
1096 err = ext4_journal_get_write_access(handle, bh);
1097 if (unlikely(err))
1098 goto out_err;
1099 }
1100 }
1101
1102 for (p = first; p < last; p++)
1103 *p = 0;
1104
1105 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
1106 return 0;
1107out_err:
1108 ext4_std_error(inode->i_sb, err);
1109 return err;
1110}
1111
1112/**
1113 * ext4_free_data - free a list of data blocks
1114 * @handle: handle for this transaction
1115 * @inode: inode we are dealing with
1116 * @this_bh: indirect buffer_head which contains *@first and *@last
1117 * @first: array of block numbers
1118 * @last: points immediately past the end of array
1119 *
1120 * We are freeing all blocks referred from that array (numbers are stored as
1121 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1122 *
1123 * We accumulate contiguous runs of blocks to free. Conveniently, if these
1124 * blocks are contiguous then releasing them at one time will only affect one
1125 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1126 * actually use a lot of journal space.
1127 *
1128 * @this_bh will be %NULL if @first and @last point into the inode's direct
1129 * block pointers.
1130 */
1131static void ext4_free_data(handle_t *handle, struct inode *inode,
1132 struct buffer_head *this_bh,
1133 __le32 *first, __le32 *last)
1134{
1135 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
1136 unsigned long count = 0; /* Number of blocks in the run */
1137 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
1138 corresponding to
1139 block_to_free */
1140 ext4_fsblk_t nr; /* Current block # */
1141 __le32 *p; /* Pointer into inode/ind
1142 for current block */
1143 int err = 0;
1144
1145 if (this_bh) { /* For indirect block */
1146 BUFFER_TRACE(this_bh, "get_write_access");
1147 err = ext4_journal_get_write_access(handle, this_bh);
1148 /* Important: if we can't update the indirect pointers
1149 * to the blocks, we can't free them. */
1150 if (err)
1151 return;
1152 }
1153
1154 for (p = first; p < last; p++) {
1155 nr = le32_to_cpu(*p);
1156 if (nr) {
1157 /* accumulate blocks to free if they're contiguous */
1158 if (count == 0) {
1159 block_to_free = nr;
1160 block_to_free_p = p;
1161 count = 1;
1162 } else if (nr == block_to_free + count) {
1163 count++;
1164 } else {
1165 err = ext4_clear_blocks(handle, inode, this_bh,
1166 block_to_free, count,
1167 block_to_free_p, p);
1168 if (err)
1169 break;
1170 block_to_free = nr;
1171 block_to_free_p = p;
1172 count = 1;
1173 }
1174 }
1175 }
1176
1177 if (!err && count > 0)
1178 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
1179 count, block_to_free_p, p);
1180 if (err < 0)
1181 /* fatal error */
1182 return;
1183
1184 if (this_bh) {
1185 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
1186
1187 /*
1188 * The buffer head should have an attached journal head at this
1189 * point. However, if the data is corrupted and an indirect
1190 * block pointed to itself, it would have been detached when
1191 * the block was cleared. Check for this instead of OOPSing.
1192 */
1193 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
1194 ext4_handle_dirty_metadata(handle, inode, this_bh);
1195 else
1196 EXT4_ERROR_INODE(inode,
1197 "circular indirect block detected at "
1198 "block %llu",
1199 (unsigned long long) this_bh->b_blocknr);
1200 }
1201}
1202
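ext4_free_data() hands ext4_clear_blocks() maximal contiguous runs, so one call covers one extent's worth of bitmap updates. The accumulation logic on its own, with made-up sample block numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t blocks[] = { 100, 101, 102, 0, 200, 201, 50 };
	uint32_t start = 0;
	unsigned long count = 0;

	for (unsigned int i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++) {
		uint32_t nr = blocks[i];

		if (!nr)
			continue;	/* a hole */
		if (count == 0) {
			start = nr;
			count = 1;
		} else if (nr == start + count) {
			count++;	/* run keeps growing */
		} else {
			printf("free %u..%lu\n", (unsigned)start,
			       (unsigned long)start + count - 1);
			start = nr;
			count = 1;
		}
	}
	if (count)
		printf("free %u..%lu\n", (unsigned)start,
		       (unsigned long)start + count - 1);
	return 0;
}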
1203/**
1204 * ext4_free_branches - free an array of branches
1205 * @handle: JBD handle for this transaction
1206 * @inode: inode we are dealing with
1207 * @parent_bh: the buffer_head which contains *@first and *@last
1208 * @first: array of block numbers
1209 * @last: pointer immediately past the end of array
1210 * @depth: depth of the branches to free
1211 *
1212 * We are freeing all blocks referred from these branches (numbers are
1213 * stored as little-endian 32-bit) and updating @inode->i_blocks
1214 * appropriately.
1215 */
1216static void ext4_free_branches(handle_t *handle, struct inode *inode,
1217 struct buffer_head *parent_bh,
1218 __le32 *first, __le32 *last, int depth)
1219{
1220 ext4_fsblk_t nr;
1221 __le32 *p;
1222
1223 if (ext4_handle_is_aborted(handle))
1224 return;
1225
1226 if (depth--) {
1227 struct buffer_head *bh;
1228 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1229 p = last;
1230 while (--p >= first) {
1231 nr = le32_to_cpu(*p);
1232 if (!nr)
1233 continue; /* A hole */
1234
1235 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
1236 nr, 1)) {
1237 EXT4_ERROR_INODE(inode,
1238 "invalid indirect mapped "
1239 "block %lu (level %d)",
1240 (unsigned long) nr, depth);
1241 break;
1242 }
1243
1244 /* Go read the buffer for the next level down */
1245 bh = sb_bread(inode->i_sb, nr);
1246
1247 /*
1248 * A read failure? Report error and clear slot
1249 * (should be rare).
1250 */
1251 if (!bh) {
1252 EXT4_ERROR_INODE_BLOCK(inode, nr,
1253 "Read failure");
1254 continue;
1255 }
1256
1257 /* This zaps the entire block. Bottom up. */
1258 BUFFER_TRACE(bh, "free child branches");
1259 ext4_free_branches(handle, inode, bh,
1260 (__le32 *) bh->b_data,
1261 (__le32 *) bh->b_data + addr_per_block,
1262 depth);
1263 brelse(bh);
1264
1265 /*
1266	 * Everything below this pointer has been
1267 * released. Now let this top-of-subtree go.
1268 *
1269 * We want the freeing of this indirect block to be
1270 * atomic in the journal with the updating of the
1271 * bitmap block which owns it. So make some room in
1272 * the journal.
1273 *
1274 * We zero the parent pointer *after* freeing its
1275 * pointee in the bitmaps, so if extend_transaction()
1276 * for some reason fails to put the bitmap changes and
1277 * the release into the same transaction, recovery
1278 * will merely complain about releasing a free block,
1279 * rather than leaking blocks.
1280 */
1281 if (ext4_handle_is_aborted(handle))
1282 return;
1283 if (try_to_extend_transaction(handle, inode)) {
1284 ext4_mark_inode_dirty(handle, inode);
1285 ext4_truncate_restart_trans(handle, inode,
1286 ext4_blocks_for_truncate(inode));
1287 }
1288
1289 /*
1290 * The forget flag here is critical because if
1291 * we are journaling (and not doing data
1292 * journaling), we have to make sure a revoke
1293 * record is written to prevent the journal
1294 * replay from overwriting the (former)
1295 * indirect block if it gets reallocated as a
1296 * data block. This must happen in the same
1297 * transaction where the data blocks are
1298 * actually freed.
1299 */
1300 ext4_free_blocks(handle, inode, NULL, nr, 1,
1301 EXT4_FREE_BLOCKS_METADATA|
1302 EXT4_FREE_BLOCKS_FORGET);
1303
1304 if (parent_bh) {
1305 /*
1306 * The block which we have just freed is
1307 * pointed to by an indirect block: journal it
1308 */
1309 BUFFER_TRACE(parent_bh, "get_write_access");
1310 if (!ext4_journal_get_write_access(handle,
1311 parent_bh)){
1312 *p = 0;
1313 BUFFER_TRACE(parent_bh,
1314 "call ext4_handle_dirty_metadata");
1315 ext4_handle_dirty_metadata(handle,
1316 inode,
1317 parent_bh);
1318 }
1319 }
1320 }
1321 } else {
1322 /* We have reached the bottom of the tree. */
1323 BUFFER_TRACE(parent_bh, "free data blocks");
1324 ext4_free_data(handle, inode, parent_bh, first, last);
1325 }
1326}
1327
1328void ext4_ind_truncate(struct inode *inode)
1329{
1330 handle_t *handle;
1331 struct ext4_inode_info *ei = EXT4_I(inode);
1332 __le32 *i_data = ei->i_data;
1333 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1334 struct address_space *mapping = inode->i_mapping;
1335 ext4_lblk_t offsets[4];
1336 Indirect chain[4];
1337 Indirect *partial;
1338 __le32 nr = 0;
1339 int n = 0;
1340 ext4_lblk_t last_block, max_block;
1341 unsigned blocksize = inode->i_sb->s_blocksize;
1342
1343 handle = start_transaction(inode);
1344 if (IS_ERR(handle))
1345 return; /* AKPM: return what? */
1346
1347 last_block = (inode->i_size + blocksize-1)
1348 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1349 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1350 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1351
1352 if (inode->i_size & (blocksize - 1))
1353 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
1354 goto out_stop;
1355
1356 if (last_block != max_block) {
1357 n = ext4_block_to_path(inode, last_block, offsets, NULL);
1358 if (n == 0)
1359 goto out_stop; /* error */
1360 }
1361
1362 /*
1363 * OK. This truncate is going to happen. We add the inode to the
1364 * orphan list, so that if this truncate spans multiple transactions,
1365 * and we crash, we will resume the truncate when the filesystem
1366 * recovers. It also marks the inode dirty, to catch the new size.
1367 *
1368 * Implication: the file must always be in a sane, consistent
1369 * truncatable state while each transaction commits.
1370 */
1371 if (ext4_orphan_add(handle, inode))
1372 goto out_stop;
1373
1374 /*
1375 * From here we block out all ext4_get_block() callers who want to
1376 * modify the block allocation tree.
1377 */
1378 down_write(&ei->i_data_sem);
1379
1380 ext4_discard_preallocations(inode);
1381
1382 /*
1383 * The orphan list entry will now protect us from any crash which
1384 * occurs before the truncate completes, so it is now safe to propagate
1385 * the new, shorter inode size (held for now in i_size) into the
1386 * on-disk inode. We do this via i_disksize, which is the value which
1387 * ext4 *really* writes onto the disk inode.
1388 */
1389 ei->i_disksize = inode->i_size;
1390
1391 if (last_block == max_block) {
1392 /*
1393 * It is unnecessary to free any data blocks if last_block is
1394 * equal to the indirect block limit.
1395 */
1396 goto out_unlock;
1397 } else if (n == 1) { /* direct blocks */
1398 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
1399 i_data + EXT4_NDIR_BLOCKS);
1400 goto do_indirects;
1401 }
1402
1403 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
1404 /* Kill the top of shared branch (not detached) */
1405 if (nr) {
1406 if (partial == chain) {
1407 /* Shared branch grows from the inode */
1408 ext4_free_branches(handle, inode, NULL,
1409 &nr, &nr+1, (chain+n-1) - partial);
1410 *partial->p = 0;
1411 /*
1412 * We mark the inode dirty prior to restart,
1413 * and prior to stop. No need for it here.
1414 */
1415 } else {
1416 /* Shared branch grows from an indirect block */
1417 BUFFER_TRACE(partial->bh, "get_write_access");
1418 ext4_free_branches(handle, inode, partial->bh,
1419 partial->p,
1420 partial->p+1, (chain+n-1) - partial);
1421 }
1422 }
1423 /* Clear the ends of indirect blocks on the shared branch */
1424 while (partial > chain) {
1425 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
1426 (__le32*)partial->bh->b_data+addr_per_block,
1427 (chain+n-1) - partial);
1428 BUFFER_TRACE(partial->bh, "call brelse");
1429 brelse(partial->bh);
1430 partial--;
1431 }
1432do_indirects:
1433 /* Kill the remaining (whole) subtrees */
1434 switch (offsets[0]) {
1435 default:
1436 nr = i_data[EXT4_IND_BLOCK];
1437 if (nr) {
1438 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
1439 i_data[EXT4_IND_BLOCK] = 0;
1440 }
1441 case EXT4_IND_BLOCK:
1442 nr = i_data[EXT4_DIND_BLOCK];
1443 if (nr) {
1444 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
1445 i_data[EXT4_DIND_BLOCK] = 0;
1446 }
1447 case EXT4_DIND_BLOCK:
1448 nr = i_data[EXT4_TIND_BLOCK];
1449 if (nr) {
1450 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
1451 i_data[EXT4_TIND_BLOCK] = 0;
1452 }
1453 case EXT4_TIND_BLOCK:
1454 ;
1455 }
1456
1457out_unlock:
1458 up_write(&ei->i_data_sem);
1459 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1460 ext4_mark_inode_dirty(handle, inode);
1461
1462 /*
1463 * In a multi-transaction truncate, we only make the final transaction
1464 * synchronous
1465 */
1466 if (IS_SYNC(inode))
1467 ext4_handle_sync(handle);
1468out_stop:
1469 /*
1470 * If this was a simple ftruncate(), and the file will remain alive
1471 * then we need to clear up the orphan record which we created above.
1472 * However, if this was a real unlink then we were called by
1473 * ext4_delete_inode(), and we allow that function to clean up the
1474 * orphan info for us.
1475 */
1476 if (inode->i_nlink)
1477 ext4_orphan_del(handle, inode);
1478
1479 ext4_journal_stop(handle);
1480 trace_ext4_truncate_exit(inode);
1481}
1482
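For reference, the last_block/max_block computation near the top of ext4_ind_truncate() is a plain round-up-then-shift. A minimal userspace sketch of the same arithmetic follows; the 4 KiB block size (blkbits == 12) is a demo assumption, not a value read from a superblock.

#include <stdio.h>

/* Round a byte count up to whole blocks, then convert to a block index,
 * mirroring "(i_size + blocksize-1) >> EXT4_BLOCK_SIZE_BITS(sb)" above. */
static unsigned long bytes_to_blocks(unsigned long long size,
				     unsigned int blkbits)
{
	unsigned long long blocksize = 1ULL << blkbits;

	return (unsigned long)((size + blocksize - 1) >> blkbits);
}

int main(void)
{
	printf("%lu\n", bytes_to_blocks(0, 12));    /* 0 */
	printf("%lu\n", bytes_to_blocks(1, 12));    /* 1 */
	printf("%lu\n", bytes_to_blocks(4096, 12)); /* 1 */
	printf("%lu\n", bytes_to_blocks(4097, 12)); /* 2 */
	return 0;
}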
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e5191f9f398..d47264cafee0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -12,10 +12,6 @@
12 * 12 *
13 * Copyright (C) 1991, 1992 Linus Torvalds 13 * Copyright (C) 1991, 1992 Linus Torvalds
14 * 14 *
15 * Goal-directed block allocation by Stephen Tweedie
16 * (sct@redhat.com), 1993, 1998
17 * Big-endian to little-endian byte-swapping/bitmaps by
18 * David S. Miller (davem@caip.rutgers.edu), 1995
19 * 64-bit file support on 64-bit platforms by Jakub Jelinek 15 * 64-bit file support on 64-bit platforms by Jakub Jelinek
20 * (jj@sunsite.ms.mff.cuni.cz) 16 * (jj@sunsite.ms.mff.cuni.cz)
21 * 17 *
@@ -47,6 +43,7 @@
47#include "xattr.h" 43#include "xattr.h"
48#include "acl.h" 44#include "acl.h"
49#include "ext4_extents.h" 45#include "ext4_extents.h"
46#include "truncate.h"
50 47
51#include <trace/events/ext4.h> 48#include <trace/events/ext4.h>
52 49
@@ -89,72 +86,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
89} 86}
90 87
91/* 88/*
92 * Work out how many blocks we need to proceed with the next chunk of a
93 * truncate transaction.
94 */
95static unsigned long blocks_for_truncate(struct inode *inode)
96{
97 ext4_lblk_t needed;
98
99 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
100
101 /* Give ourselves just enough room to cope with inodes in which
102 * i_blocks is corrupt: we've seen disk corruptions in the past
103 * which resulted in random data in an inode which looked enough
104 * like a regular file for ext4 to try to delete it. Things
105 * will go a bit crazy if that happens, but at least we should
106 * try not to panic the whole kernel. */
107 if (needed < 2)
108 needed = 2;
109
110 /* But we need to bound the transaction so we don't overflow the
111 * journal. */
112 if (needed > EXT4_MAX_TRANS_DATA)
113 needed = EXT4_MAX_TRANS_DATA;
114
115 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
116}
117
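The clamp in blocks_for_truncate(), which this patch removes from inode.c in favour of ext4_blocks_for_truncate(), is easy to check in isolation. In the sketch below DEMO_MAX_TRANS_DATA and DEMO_DATA_TRANS_BLOCKS are stand-ins for EXT4_MAX_TRANS_DATA and EXT4_DATA_TRANS_BLOCKS(sb); the real values depend on the filesystem, so treat the numbers as illustrative only.

#include <stdio.h>

#define DEMO_MAX_TRANS_DATA	64 /* stand-in for EXT4_MAX_TRANS_DATA */
#define DEMO_DATA_TRANS_BLOCKS	 8 /* stand-in for EXT4_DATA_TRANS_BLOCKS(sb) */

/* Mirror blocks_for_truncate(): at least 2 credits so a corrupt i_blocks
 * cannot starve the transaction, at most DEMO_MAX_TRANS_DATA so a huge
 * file cannot overflow the journal. */
static unsigned long demo_blocks_for_truncate(unsigned long long i_blocks,
					      unsigned int blocksize_bits)
{
	unsigned long needed = i_blocks >> (blocksize_bits - 9);

	if (needed < 2)
		needed = 2;
	if (needed > DEMO_MAX_TRANS_DATA)
		needed = DEMO_MAX_TRANS_DATA;
	return DEMO_DATA_TRANS_BLOCKS + needed;
}

int main(void)
{
	printf("%lu\n", demo_blocks_for_truncate(0, 12));   /* 8 + 2  = 10 */
	printf("%lu\n", demo_blocks_for_truncate(800, 12)); /* 8 + 64 = 72 */
	return 0;
}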
118/*
119 * Truncate transactions can be complex and absolutely huge. So we need to
120 * be able to restart the transaction at a convenient checkpoint to make
121 * sure we don't overflow the journal.
122 *
123 * start_transaction gets us a new handle for a truncate transaction,
124 * and extend_transaction tries to extend the existing one a bit. If
125 * extend fails, we need to propagate the failure up and restart the
126 * transaction in the top-level truncate loop. --sct
127 */
128static handle_t *start_transaction(struct inode *inode)
129{
130 handle_t *result;
131
132 result = ext4_journal_start(inode, blocks_for_truncate(inode));
133 if (!IS_ERR(result))
134 return result;
135
136 ext4_std_error(inode->i_sb, PTR_ERR(result));
137 return result;
138}
139
140/*
141 * Try to extend this transaction for the purposes of truncation.
142 *
143 * Returns 0 if we managed to create more room. If we can't create more
144 * room, and the transaction must be restarted we return 1.
145 */
146static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
147{
148 if (!ext4_handle_valid(handle))
149 return 0;
150 if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1))
151 return 0;
152 if (!ext4_journal_extend(handle, blocks_for_truncate(inode)))
153 return 0;
154 return 1;
155}
156
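try_to_extend_transaction() returns 0 when the caller may keep going and 1 when the transaction must be restarted; that contract drives the truncate loops elsewhere in this patch. Below is a toy model of the same decision with journal credits reduced to a plain integer; demo_handle and demo_try_extend are invented names for the sketch, not jbd2 API.

#include <stdio.h>

/* Toy handle: how many journal credits remain in this transaction. */
struct demo_handle {
	int credits;
};

/* Return 0 if the handle has (or can be topped up to) enough credits,
 * 1 if the caller must commit and start a fresh transaction. */
static int demo_try_extend(struct demo_handle *h, int reserve,
			   int journal_has_room)
{
	if (h->credits >= reserve)
		return 0;
	if (journal_has_room) {
		h->credits += reserve;	/* extend in place */
		return 0;
	}
	return 1;
}

int main(void)
{
	struct demo_handle h = { .credits = 1 };

	if (demo_try_extend(&h, 8, 0))
		printf("restart: commit this transaction, open a new one\n");
	return 0;
}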
157/*
158 * Restart the transaction associated with *handle. This does a commit, 89 * Restart the transaction associated with *handle. This does a commit,
159 * so before we call here everything must be consistently dirtied against 90 * so before we call here everything must be consistently dirtied against
160 * this transaction. 91 * this transaction.
@@ -190,6 +121,33 @@ void ext4_evict_inode(struct inode *inode)
190 121
191 trace_ext4_evict_inode(inode); 122 trace_ext4_evict_inode(inode);
192 if (inode->i_nlink) { 123 if (inode->i_nlink) {
124 /*
125 * When journalling data dirty buffers are tracked only in the
126 * journal. So although mm thinks everything is clean and
127 * ready for reaping the inode might still have some pages to
128 * write in the running transaction or waiting to be
129 * checkpointed. Thus calling jbd2_journal_invalidatepage()
130 * (via truncate_inode_pages()) to discard these buffers can
131 * cause data loss. Also even if we did not discard these
132 * buffers, we would have no way to find them after the inode
133 * is reaped and thus the user could see stale data when trying to
134 * read them before the transaction is checkpointed. So be
135 * careful and force everything to disk here... We use
136 * ei->i_datasync_tid to store the newest transaction
137 * containing inode's data.
138 *
139 * Note that directories do not have this problem because they
140 * don't use page cache.
141 */
142 if (ext4_should_journal_data(inode) &&
143 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
144 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
145 tid_t commit_tid = EXT4_I(inode)->i_datasync_tid;
146
147 jbd2_log_start_commit(journal, commit_tid);
148 jbd2_log_wait_commit(journal, commit_tid);
149 filemap_write_and_wait(&inode->i_data);
150 }
193 truncate_inode_pages(&inode->i_data, 0); 151 truncate_inode_pages(&inode->i_data, 0);
194 goto no_delete; 152 goto no_delete;
195 } 153 }
@@ -204,7 +162,7 @@ void ext4_evict_inode(struct inode *inode)
204 if (is_bad_inode(inode)) 162 if (is_bad_inode(inode))
205 goto no_delete; 163 goto no_delete;
206 164
207 handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); 165 handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
208 if (IS_ERR(handle)) { 166 if (IS_ERR(handle)) {
209 ext4_std_error(inode->i_sb, PTR_ERR(handle)); 167 ext4_std_error(inode->i_sb, PTR_ERR(handle));
210 /* 168 /*
@@ -277,793 +235,6 @@ no_delete:
277 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ 235 ext4_clear_inode(inode); /* We must guarantee clearing of inode... */
278} 236}
279 237
280typedef struct {
281 __le32 *p;
282 __le32 key;
283 struct buffer_head *bh;
284} Indirect;
285
286static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
287{
288 p->key = *(p->p = v);
289 p->bh = bh;
290}
291
292/**
293 * ext4_block_to_path - parse the block number into array of offsets
294 * @inode: inode in question (we are only interested in its superblock)
295 * @i_block: block number to be parsed
296 * @offsets: array to store the offsets in
297 * @boundary: set this non-zero if the referred-to block is likely to be
298 * followed (on disk) by an indirect block.
299 *
300 * To store the locations of file's data ext4 uses a data structure common
301 * for UNIX filesystems - tree of pointers anchored in the inode, with
302 * data blocks at leaves and indirect blocks in intermediate nodes.
303 * This function translates the block number into a path in that tree -
304 * return value is the path length and @offsets[n] is the offset of
305 * pointer to (n+1)th node in the nth one. If @block is out of range
306 * (negative or too large) a warning is printed and zero is returned.
307 *
308 * Note: function doesn't find node addresses, so no IO is needed. All
309 * we need to know is the capacity of indirect blocks (taken from the
310 * inode->i_sb).
311 */
312
313/*
314 * Portability note: the last comparison (check that we fit into triple
315 * indirect block) is spelled differently, because otherwise on an
316 * architecture with 32-bit longs and 8Kb pages we might get into trouble
317 * if our filesystem had 8Kb blocks. We might use long long, but that would
318 * kill us on x86. Oh, well, at least the sign propagation does not matter -
319 * i_block would have to be negative in the very beginning, so we would not
320 * get there at all.
321 */
322
323static int ext4_block_to_path(struct inode *inode,
324 ext4_lblk_t i_block,
325 ext4_lblk_t offsets[4], int *boundary)
326{
327 int ptrs = EXT4_ADDR_PER_BLOCK(inode->i_sb);
328 int ptrs_bits = EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb);
329 const long direct_blocks = EXT4_NDIR_BLOCKS,
330 indirect_blocks = ptrs,
331 double_blocks = (1 << (ptrs_bits * 2));
332 int n = 0;
333 int final = 0;
334
335 if (i_block < direct_blocks) {
336 offsets[n++] = i_block;
337 final = direct_blocks;
338 } else if ((i_block -= direct_blocks) < indirect_blocks) {
339 offsets[n++] = EXT4_IND_BLOCK;
340 offsets[n++] = i_block;
341 final = ptrs;
342 } else if ((i_block -= indirect_blocks) < double_blocks) {
343 offsets[n++] = EXT4_DIND_BLOCK;
344 offsets[n++] = i_block >> ptrs_bits;
345 offsets[n++] = i_block & (ptrs - 1);
346 final = ptrs;
347 } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
348 offsets[n++] = EXT4_TIND_BLOCK;
349 offsets[n++] = i_block >> (ptrs_bits * 2);
350 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
351 offsets[n++] = i_block & (ptrs - 1);
352 final = ptrs;
353 } else {
354 ext4_warning(inode->i_sb, "block %lu > max in inode %lu",
355 i_block + direct_blocks +
356 indirect_blocks + double_blocks, inode->i_ino);
357 }
358 if (boundary)
359 *boundary = final - 1 - (i_block & (ptrs - 1));
360 return n;
361}
362
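The offset decomposition in ext4_block_to_path() can be exercised outside the kernel. The sketch below hardcodes 12 direct slots and 1024 pointers per indirect block (a 4 KiB block of 32-bit entries); those constants, and the slot indices 12/13/14, are demo assumptions standing in for EXT4_NDIR_BLOCKS, EXT4_ADDR_PER_BLOCK() and the EXT4_*_BLOCK macros. The out-of-range warning of the original is omitted for brevity.

#include <stdio.h>

#define DEMO_NDIR	12   /* stand-in for EXT4_NDIR_BLOCKS */
#define DEMO_PTRS	1024 /* pointers per 4 KiB indirect block */
#define DEMO_PTRS_BITS	10

/* Fill offsets[] with the path to i_block and return its depth. */
static int demo_block_to_path(unsigned long i_block, unsigned long offsets[4])
{
	int n = 0;

	if (i_block < DEMO_NDIR) {
		offsets[n++] = i_block;
	} else if ((i_block -= DEMO_NDIR) < DEMO_PTRS) {
		offsets[n++] = 12; /* EXT4_IND_BLOCK slot */
		offsets[n++] = i_block;
	} else if ((i_block -= DEMO_PTRS) < (1UL << (DEMO_PTRS_BITS * 2))) {
		offsets[n++] = 13; /* EXT4_DIND_BLOCK slot */
		offsets[n++] = i_block >> DEMO_PTRS_BITS;
		offsets[n++] = i_block & (DEMO_PTRS - 1);
	} else {
		i_block -= 1UL << (DEMO_PTRS_BITS * 2);
		offsets[n++] = 14; /* EXT4_TIND_BLOCK slot */
		offsets[n++] = i_block >> (DEMO_PTRS_BITS * 2);
		offsets[n++] = (i_block >> DEMO_PTRS_BITS) & (DEMO_PTRS - 1);
		offsets[n++] = i_block & (DEMO_PTRS - 1);
	}
	return n;
}

int main(void)
{
	unsigned long off[4];
	int i, n = demo_block_to_path(5000, off);

	for (i = 0; i < n; i++)
		printf("offsets[%d] = %lu\n", i, off[i]); /* 13, 3, 892 */
	return 0;
}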
363static int __ext4_check_blockref(const char *function, unsigned int line,
364 struct inode *inode,
365 __le32 *p, unsigned int max)
366{
367 struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
368 __le32 *bref = p;
369 unsigned int blk;
370
371 while (bref < p+max) {
372 blk = le32_to_cpu(*bref++);
373 if (blk &&
374 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
375 blk, 1))) {
376 es->s_last_error_block = cpu_to_le64(blk);
377 ext4_error_inode(inode, function, line, blk,
378 "invalid block");
379 return -EIO;
380 }
381 }
382 return 0;
383}
384
385
386#define ext4_check_indirect_blockref(inode, bh) \
387 __ext4_check_blockref(__func__, __LINE__, inode, \
388 (__le32 *)(bh)->b_data, \
389 EXT4_ADDR_PER_BLOCK((inode)->i_sb))
390
391#define ext4_check_inode_blockref(inode) \
392 __ext4_check_blockref(__func__, __LINE__, inode, \
393 EXT4_I(inode)->i_data, \
394 EXT4_NDIR_BLOCKS)
395
396/**
397 * ext4_get_branch - read the chain of indirect blocks leading to data
398 * @inode: inode in question
399 * @depth: depth of the chain (1 - direct pointer, etc.)
400 * @offsets: offsets of pointers in inode/indirect blocks
401 * @chain: place to store the result
402 * @err: here we store the error value
403 *
404 * Function fills the array of triples <key, p, bh> and returns %NULL
405 * if everything went OK or the pointer to the last filled triple
406 * (incomplete one) otherwise. Upon the return chain[i].key contains
407 * the number of (i+1)-th block in the chain (as it is stored in memory,
408 * i.e. little-endian 32-bit), chain[i].p contains the address of that
409 * number (it points into struct inode for i==0 and into the bh->b_data
410 * for i>0) and chain[i].bh points to the buffer_head of i-th indirect
411 * block for i>0 and NULL for i==0. In other words, it holds the block
412 * numbers of the chain, addresses they were taken from (and where we can
413 * verify that chain did not change) and buffer_heads hosting these
414 * numbers.
415 *
416 * Function stops when it stumbles upon zero pointer (absent block)
417 * (pointer to last triple returned, *@err == 0)
418 * or when it gets an IO error reading an indirect block
419 * (ditto, *@err == -EIO)
420 * or when it reads all @depth-1 indirect blocks successfully and finds
421 * the whole chain, all way to the data (returns %NULL, *err == 0).
422 *
423 * Need to be called with
424 * down_read(&EXT4_I(inode)->i_data_sem)
425 */
426static Indirect *ext4_get_branch(struct inode *inode, int depth,
427 ext4_lblk_t *offsets,
428 Indirect chain[4], int *err)
429{
430 struct super_block *sb = inode->i_sb;
431 Indirect *p = chain;
432 struct buffer_head *bh;
433
434 *err = 0;
435 /* i_data is not going away, no lock needed */
436 add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
437 if (!p->key)
438 goto no_block;
439 while (--depth) {
440 bh = sb_getblk(sb, le32_to_cpu(p->key));
441 if (unlikely(!bh))
442 goto failure;
443
444 if (!bh_uptodate_or_lock(bh)) {
445 if (bh_submit_read(bh) < 0) {
446 put_bh(bh);
447 goto failure;
448 }
449 /* validate block references */
450 if (ext4_check_indirect_blockref(inode, bh)) {
451 put_bh(bh);
452 goto failure;
453 }
454 }
455
456 add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
457 /* Reader: end */
458 if (!p->key)
459 goto no_block;
460 }
461 return NULL;
462
463failure:
464 *err = -EIO;
465no_block:
466 return p;
467}
468
469/**
470 * ext4_find_near - find a place for allocation with sufficient locality
471 * @inode: owner
472 * @ind: descriptor of indirect block.
473 *
474 * This function returns the preferred place for block allocation.
475 * It is used when heuristic for sequential allocation fails.
476 * Rules are:
477 * + if there is a block to the left of our position - allocate near it.
478 * + if pointer will live in indirect block - allocate near that block.
479 * + if pointer will live in inode - allocate in the same
480 * cylinder group.
481 *
482 * In the latter case we colour the starting block by the caller's PID to
483 * prevent it from clashing with concurrent allocations for a different inode
484 * in the same block group. The PID is used here so that functionally related
485 * files will be close-by on-disk.
486 *
487 * Caller must make sure that @ind is valid and will stay that way.
488 */
489static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
490{
491 struct ext4_inode_info *ei = EXT4_I(inode);
492 __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
493 __le32 *p;
494 ext4_fsblk_t bg_start;
495 ext4_fsblk_t last_block;
496 ext4_grpblk_t colour;
497 ext4_group_t block_group;
498 int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
499
500 /* Try to find previous block */
501 for (p = ind->p - 1; p >= start; p--) {
502 if (*p)
503 return le32_to_cpu(*p);
504 }
505
506 /* No such thing, so let's try location of indirect block */
507 if (ind->bh)
508 return ind->bh->b_blocknr;
509
510 /*
511 * It is going to be referred to from the inode itself? OK, just put it
512 * into the same cylinder group then.
513 */
514 block_group = ei->i_block_group;
515 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
516 block_group &= ~(flex_size-1);
517 if (S_ISREG(inode->i_mode))
518 block_group++;
519 }
520 bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
521 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
522
523 /*
524 * If we are doing delayed allocation, we don't need to take
525 * colour into account.
526 */
527 if (test_opt(inode->i_sb, DELALLOC))
528 return bg_start;
529
530 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
531 colour = (current->pid % 16) *
532 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
533 else
534 colour = (current->pid % 16) * ((last_block - bg_start) / 16);
535 return bg_start + colour;
536}
537
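The PID colouring at the end of ext4_find_near() just spreads concurrent allocators across sixteen slices of a block group. The one-liner in isolation; DEMO_BLOCKS_PER_GROUP is an assumed figure, not EXT4_BLOCKS_PER_GROUP() from a live superblock.

#include <stdio.h>
#include <unistd.h>

#define DEMO_BLOCKS_PER_GROUP 32768 /* assumed group size for the demo */

/* Mirror "colour = (current->pid % 16) * (blocks_per_group / 16)". */
static unsigned long demo_colour(long pid)
{
	return (unsigned long)(pid % 16) * (DEMO_BLOCKS_PER_GROUP / 16);
}

int main(void)
{
	long pid = (long)getpid();

	printf("pid %ld starts its search %lu blocks into the group\n",
	       pid, demo_colour(pid));
	return 0;
}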
538/**
539 * ext4_find_goal - find a preferred place for allocation.
540 * @inode: owner
541 * @block: block we want
542 * @partial: pointer to the last triple within a chain
543 *
544 * Normally this function finds the preferred place for block allocation
545 * and returns it.
546 * Because this is only used for non-extent files, we limit the block nr
547 * to 32 bits.
548 */
549static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
550 Indirect *partial)
551{
552 ext4_fsblk_t goal;
553
554 /*
555 * XXX need to get goal block from mballoc's data structures
556 */
557
558 goal = ext4_find_near(inode, partial);
559 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
560 return goal;
561}
562
563/**
564 * ext4_blks_to_allocate - Look up the block map and count the number
565 *	of direct blocks that need to be allocated for the given branch.
566 *
567 * @branch: chain of indirect blocks
568 *	@k: number of blocks needed for indirect blocks
569 * @blks: number of data blocks to be mapped.
570 * @blocks_to_boundary: the offset in the indirect block
571 *
572 *	return the total number of blocks to be allocated, including the
573 * direct and indirect blocks.
574 */
575static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks,
576 int blocks_to_boundary)
577{
578 unsigned int count = 0;
579
580 /*
581	 * Simple case: the [t,d]indirect block(s) have not been allocated
582	 * yet, so clearly the blocks on that path have not been allocated either
583 */
584 if (k > 0) {
585 /* right now we don't handle cross boundary allocation */
586 if (blks < blocks_to_boundary + 1)
587 count += blks;
588 else
589 count += blocks_to_boundary + 1;
590 return count;
591 }
592
593 count++;
594 while (count < blks && count <= blocks_to_boundary &&
595 le32_to_cpu(*(branch[0].p + count)) == 0) {
596 count++;
597 }
598 return count;
599}
600
601/**
602 * ext4_alloc_blocks: multiple allocate blocks needed for a branch
603 * @handle: handle for this transaction
604 * @inode: inode which needs allocated blocks
605 * @iblock: the logical block to start allocated at
606 * @goal: preferred physical block of allocation
607 *	@indirect_blks: the number of blocks that need to be allocated for indirect
608 * blocks
609 * @blks: number of desired blocks
610 * @new_blocks: on return it will store the new block numbers for
611 * the indirect blocks(if needed) and the first direct block,
612 * @err: on return it will store the error code
613 *
614 * This function will return the number of blocks allocated as
615 * requested by the passed-in parameters.
616 */
617static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
618 ext4_lblk_t iblock, ext4_fsblk_t goal,
619 int indirect_blks, int blks,
620 ext4_fsblk_t new_blocks[4], int *err)
621{
622 struct ext4_allocation_request ar;
623 int target, i;
624 unsigned long count = 0, blk_allocated = 0;
625 int index = 0;
626 ext4_fsblk_t current_block = 0;
627 int ret = 0;
628
629 /*
630 * Here we try to allocate the requested multiple blocks at once,
631 * on a best-effort basis.
632 * To build a branch, we should allocate blocks for
633	 * the indirect blocks (if not allocated yet), and at least
634	 * the first direct block of this branch.  That's the
635	 * minimum number of blocks we need to allocate (required)
636 */
637 /* first we try to allocate the indirect blocks */
638 target = indirect_blks;
639 while (target > 0) {
640 count = target;
641 /* allocating blocks for indirect blocks and direct blocks */
642 current_block = ext4_new_meta_blocks(handle, inode, goal,
643 0, &count, err);
644 if (*err)
645 goto failed_out;
646
647 if (unlikely(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS)) {
648 EXT4_ERROR_INODE(inode,
649 "current_block %llu + count %lu > %d!",
650 current_block, count,
651 EXT4_MAX_BLOCK_FILE_PHYS);
652 *err = -EIO;
653 goto failed_out;
654 }
655
656 target -= count;
657 /* allocate blocks for indirect blocks */
658 while (index < indirect_blks && count) {
659 new_blocks[index++] = current_block++;
660 count--;
661 }
662 if (count > 0) {
663 /*
664 * save the new block number
665 * for the first direct block
666 */
667 new_blocks[index] = current_block;
668 printk(KERN_INFO "%s returned more blocks than "
669 "requested\n", __func__);
670 WARN_ON(1);
671 break;
672 }
673 }
674
675 target = blks - count ;
676 blk_allocated = count;
677 if (!target)
678 goto allocated;
679 /* Now allocate data blocks */
680 memset(&ar, 0, sizeof(ar));
681 ar.inode = inode;
682 ar.goal = goal;
683 ar.len = target;
684 ar.logical = iblock;
685 if (S_ISREG(inode->i_mode))
686 /* enable in-core preallocation only for regular files */
687 ar.flags = EXT4_MB_HINT_DATA;
688
689 current_block = ext4_mb_new_blocks(handle, &ar, err);
690 if (unlikely(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS)) {
691 EXT4_ERROR_INODE(inode,
692 "current_block %llu + ar.len %d > %d!",
693 current_block, ar.len,
694 EXT4_MAX_BLOCK_FILE_PHYS);
695 *err = -EIO;
696 goto failed_out;
697 }
698
699 if (*err && (target == blks)) {
700 /*
701 * if the allocation failed and we didn't allocate
702 * any blocks before
703 */
704 goto failed_out;
705 }
706 if (!*err) {
707 if (target == blks) {
708 /*
709 * save the new block number
710 * for the first direct block
711 */
712 new_blocks[index] = current_block;
713 }
714 blk_allocated += ar.len;
715 }
716allocated:
717 /* total number of blocks allocated for direct blocks */
718 ret = blk_allocated;
719 *err = 0;
720 return ret;
721failed_out:
722 for (i = 0; i < index; i++)
723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
724 return ret;
725}
726
727/**
728 * ext4_alloc_branch - allocate and set up a chain of blocks.
729 * @handle: handle for this transaction
730 * @inode: owner
731 * @indirect_blks: number of allocated indirect blocks
732 * @blks: number of allocated direct blocks
733 * @goal: preferred place for allocation
734 * @offsets: offsets (in the blocks) to store the pointers to next.
735 * @branch: place to store the chain in.
736 *
737 * This function allocates blocks, zeroes out all but the last one,
738 * links them into chain and (if we are synchronous) writes them to disk.
739 * In other words, it prepares a branch that can be spliced onto the
740 * inode. It stores the information about that chain in the branch[], in
741 * the same format as ext4_get_branch() would do. We are calling it after
742 * we have read the existing part of the chain, and partial points to the last
743 * triple of that (one with zero ->key). Upon the exit we have the same
744 * picture as after the successful ext4_get_block(), except that in one
745 * place chain is disconnected - *branch->p is still zero (we did not
746 * set the last link), but branch->key contains the number that should
747 * be placed into *branch->p to fill that gap.
748 *
749 * If allocation fails we free all blocks we've allocated (and forget
750 * their buffer_heads) and return the error value from the failed
751 * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain
752 * as described above and return 0.
753 */
754static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
755 ext4_lblk_t iblock, int indirect_blks,
756 int *blks, ext4_fsblk_t goal,
757 ext4_lblk_t *offsets, Indirect *branch)
758{
759 int blocksize = inode->i_sb->s_blocksize;
760 int i, n = 0;
761 int err = 0;
762 struct buffer_head *bh;
763 int num;
764 ext4_fsblk_t new_blocks[4];
765 ext4_fsblk_t current_block;
766
767 num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
768 *blks, new_blocks, &err);
769 if (err)
770 return err;
771
772 branch[0].key = cpu_to_le32(new_blocks[0]);
773 /*
774 * metadata blocks and data blocks are allocated.
775 */
776 for (n = 1; n <= indirect_blks; n++) {
777 /*
778 * Get buffer_head for parent block, zero it out
779 * and set the pointer to new one, then send
780 * parent to disk.
781 */
782 bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
783 if (unlikely(!bh)) {
784 err = -EIO;
785 goto failed;
786 }
787
788 branch[n].bh = bh;
789 lock_buffer(bh);
790 BUFFER_TRACE(bh, "call get_create_access");
791 err = ext4_journal_get_create_access(handle, bh);
792 if (err) {
793 /* Don't brelse(bh) here; it's done in
794 * ext4_journal_forget() below */
795 unlock_buffer(bh);
796 goto failed;
797 }
798
799 memset(bh->b_data, 0, blocksize);
800 branch[n].p = (__le32 *) bh->b_data + offsets[n];
801 branch[n].key = cpu_to_le32(new_blocks[n]);
802 *branch[n].p = branch[n].key;
803 if (n == indirect_blks) {
804 current_block = new_blocks[n];
805 /*
806 * End of chain, update the last new metablock of
807			 * the chain to point to the newly allocated
808			 * data block numbers
809 */
810 for (i = 1; i < num; i++)
811 *(branch[n].p + i) = cpu_to_le32(++current_block);
812 }
813 BUFFER_TRACE(bh, "marking uptodate");
814 set_buffer_uptodate(bh);
815 unlock_buffer(bh);
816
817 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
818 err = ext4_handle_dirty_metadata(handle, inode, bh);
819 if (err)
820 goto failed;
821 }
822 *blks = num;
823 return err;
824failed:
825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) {
828 /*
829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */
833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET);
835 }
836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838
839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840
841 return err;
842}
843
844/**
845 * ext4_splice_branch - splice the allocated branch onto inode.
846 * @handle: handle for this transaction
847 * @inode: owner
848 * @block: (logical) number of block we are adding
849 * @chain: chain of indirect blocks (with a missing link - see
850 * ext4_alloc_branch)
851 * @where: location of missing link
852 * @num: number of indirect blocks we are adding
853 * @blks: number of direct blocks we are adding
854 *
855 * This function fills the missing link and does all housekeeping needed in
856 * inode (->i_blocks, etc.). In case of success we end up with the full
857 * chain to new block and return 0.
858 */
859static int ext4_splice_branch(handle_t *handle, struct inode *inode,
860 ext4_lblk_t block, Indirect *where, int num,
861 int blks)
862{
863 int i;
864 int err = 0;
865 ext4_fsblk_t current_block;
866
867 /*
868 * If we're splicing into a [td]indirect block (as opposed to the
869 * inode) then we need to get write access to the [td]indirect block
870 * before the splice.
871 */
872 if (where->bh) {
873 BUFFER_TRACE(where->bh, "get_write_access");
874 err = ext4_journal_get_write_access(handle, where->bh);
875 if (err)
876 goto err_out;
877 }
878 /* That's it */
879
880 *where->p = where->key;
881
882 /*
883	 * Update the host buffer_head or inode to point to the just
884	 * allocated direct blocks
885 */
886 if (num == 0 && blks > 1) {
887 current_block = le32_to_cpu(where->key) + 1;
888 for (i = 1; i < blks; i++)
889 *(where->p + i) = cpu_to_le32(current_block++);
890 }
891
892 /* We are done with atomic stuff, now do the rest of housekeeping */
893 /* had we spliced it onto indirect block? */
894 if (where->bh) {
895 /*
896 * If we spliced it onto an indirect block, we haven't
897 * altered the inode. Note however that if it is being spliced
898 * onto an indirect block at the very end of the file (the
899 * file is growing) then we *will* alter the inode to reflect
900 * the new i_size. But that is not done here - it is done in
901 * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode.
902 */
903 jbd_debug(5, "splicing indirect only\n");
904 BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata");
905 err = ext4_handle_dirty_metadata(handle, inode, where->bh);
906 if (err)
907 goto err_out;
908 } else {
909 /*
910 * OK, we spliced it into the inode itself on a direct block.
911 */
912 ext4_mark_inode_dirty(handle, inode);
913 jbd_debug(5, "splicing direct\n");
914 }
915 return err;
916
917err_out:
918 for (i = 1; i <= num; i++) {
919 /*
920 * branch[i].bh is newly allocated, so there is no
921 * need to revoke the block, which is why we don't
922 * need to set EXT4_FREE_BLOCKS_METADATA.
923 */
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET);
926 }
927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0);
929
930 return err;
931}
932
933/*
934 * The ext4_ind_map_blocks() function handles non-extent inodes
935 * (i.e., using the traditional indirect/double-indirect i_blocks
936 * scheme) for ext4_map_blocks().
937 *
938 * Allocation strategy is simple: if we have to allocate something, we will
939 * have to go the whole way to leaf. So let's do it before attaching anything
940 * to tree, set linkage between the newborn blocks, write them if sync is
941 * required, recheck the path, free and repeat if check fails, otherwise
942 * set the last missing link (that will protect us from any truncate-generated
943 * removals - all blocks on the path are immune now) and possibly force the
944 * write on the parent block.
945 * That has a nice additional property: no special recovery from the failed
946 * allocations is needed - we simply release blocks and do not touch anything
947 * reachable from inode.
948 *
949 * `handle' can be NULL if create == 0.
950 *
951 * return > 0, # of blocks mapped or allocated.
952 * return = 0, if plain lookup failed.
953 * return < 0, error case.
954 *
955 * The ext4_ind_map_blocks() function should be called with
956 * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
957 * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
958 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
959 * blocks.
960 */
961static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
962 struct ext4_map_blocks *map,
963 int flags)
964{
965 int err = -EIO;
966 ext4_lblk_t offsets[4];
967 Indirect chain[4];
968 Indirect *partial;
969 ext4_fsblk_t goal;
970 int indirect_blks;
971 int blocks_to_boundary = 0;
972 int depth;
973 int count = 0;
974 ext4_fsblk_t first_block = 0;
975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
980 &blocks_to_boundary);
981
982 if (depth == 0)
983 goto out;
984
985 partial = ext4_get_branch(inode, depth, offsets, chain, &err);
986
987 /* Simplest case - block found, no allocation needed */
988 if (!partial) {
989 first_block = le32_to_cpu(chain[depth - 1].key);
990 count++;
991 /*map more blocks*/
992 while (count < map->m_len && count <= blocks_to_boundary) {
993 ext4_fsblk_t blk;
994
995 blk = le32_to_cpu(*(chain[depth-1].p + count));
996
997 if (blk == first_block + count)
998 count++;
999 else
1000 break;
1001 }
1002 goto got_it;
1003 }
1004
1005 /* Next simple case - plain lookup or failed read of indirect block */
1006 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
1007 goto cleanup;
1008
1009 /*
1010 * Okay, we need to do block allocation.
1011 */
1012 goal = ext4_find_goal(inode, map->m_lblk, partial);
1013
1014	/* the number of blocks needed to allocate for [d,t]indirect blocks */
1015 indirect_blks = (chain + depth) - partial - 1;
1016
1017 /*
1018	 * Next look up the indirect map to count the total number of
1019 * direct blocks to allocate for this branch.
1020 */
1021 count = ext4_blks_to_allocate(partial, indirect_blks,
1022 map->m_len, blocks_to_boundary);
1023 /*
1024 * Block out ext4_truncate while we alter the tree
1025 */
1026 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
1027 &count, goal,
1028 offsets + (partial - chain), partial);
1029
1030 /*
1031 * The ext4_splice_branch call will free and forget any buffers
1032 * on the new chain if there is a failure, but that risks using
1033 * up transaction credits, especially for bitmaps where the
1034 * credits cannot be returned. Can we handle this somehow? We
1035 * may need to return -EAGAIN upwards in the worst case. --sct
1036 */
1037 if (!err)
1038 err = ext4_splice_branch(handle, inode, map->m_lblk,
1039 partial, indirect_blks, count);
1040 if (err)
1041 goto cleanup;
1042
1043 map->m_flags |= EXT4_MAP_NEW;
1044
1045 ext4_update_inode_fsync_trans(handle, inode, 1);
1046got_it:
1047 map->m_flags |= EXT4_MAP_MAPPED;
1048 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1049 map->m_len = count;
1050 if (count > blocks_to_boundary)
1051 map->m_flags |= EXT4_MAP_BOUNDARY;
1052 err = count;
1053 /* Clean up and exit */
1054 partial = chain + depth - 1; /* the whole chain */
1055cleanup:
1056 while (partial > chain) {
1057 BUFFER_TRACE(partial->bh, "call brelse");
1058 brelse(partial->bh);
1059 partial--;
1060 }
1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1064 return err;
1065}
1066
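In the "!partial" branch above, the mapping loop merely counts how many consecutive pointers continue a physically contiguous run. The same scan in isolation; host-order uint32_t values stand in for the little-endian __le32 entries, which a real buffer would pass through le32_to_cpu() first.

#include <stdio.h>
#include <stdint.h>

/* Count entries starting at p[0] that form a contiguous run on disk,
 * like the "map more blocks" loop of ext4_ind_map_blocks(). */
static unsigned int run_length(const uint32_t *p, unsigned int max)
{
	unsigned int count = 1;

	while (count < max && p[count] == p[0] + count)
		count++;
	return count;
}

int main(void)
{
	uint32_t map[] = { 1000, 1001, 1002, 1500, 1501 };

	printf("%u\n", run_length(map, 5)); /* 3: the run breaks at 1500 */
	return 0;
}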
1067#ifdef CONFIG_QUOTA 238#ifdef CONFIG_QUOTA
1068qsize_t *ext4_get_reserved_space(struct inode *inode) 239qsize_t *ext4_get_reserved_space(struct inode *inode)
1069{ 240{
@@ -1073,33 +244,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
1073 244
1074/* 245/*
1075 * Calculate the number of metadata blocks needed to reserve 246 * Calculate the number of metadata blocks needed to reserve
1076 * to allocate a new block at @lblock for a non-extent-based file
1077 */
1078static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1079 sector_t lblock)
1080{
1081 struct ext4_inode_info *ei = EXT4_I(inode);
1082 sector_t dind_mask = ~((sector_t)EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1);
1083 int blk_bits;
1084
1085 if (lblock < EXT4_NDIR_BLOCKS)
1086 return 0;
1087
1088 lblock -= EXT4_NDIR_BLOCKS;
1089
1090 if (ei->i_da_metadata_calc_len &&
1091 (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
1092 ei->i_da_metadata_calc_len++;
1093 return 0;
1094 }
1095 ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
1096 ei->i_da_metadata_calc_len = 1;
1097 blk_bits = order_base_2(lblock);
1098 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
1099}
1100
1101/*
1102 * Calculate the number of metadata blocks needed to reserve
1103 * to allocate a block located at @lblock 247 * to allocate a block located at @lblock
1104 */ 248 */
1105static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock) 249static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
@@ -1107,7 +251,7 @@ static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
1107 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 251 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1108 return ext4_ext_calc_metadata_amount(inode, lblock); 252 return ext4_ext_calc_metadata_amount(inode, lblock);
1109 253
1110 return ext4_indirect_calc_metadata_amount(inode, lblock); 254 return ext4_ind_calc_metadata_amount(inode, lblock);
1111} 255}
1112 256
1113/* 257/*
@@ -1589,16 +733,6 @@ static int do_journal_get_write_access(handle_t *handle,
1589 return ret; 733 return ret;
1590} 734}
1591 735
1592/*
1593 * Truncate blocks that were not used by write. We have to truncate the
1594 * pagecache as well so that corresponding buffers get properly unmapped.
1595 */
1596static void ext4_truncate_failed_write(struct inode *inode)
1597{
1598 truncate_inode_pages(inode->i_mapping, inode->i_size);
1599 ext4_truncate(inode);
1600}
1601
1602static int ext4_get_block_write(struct inode *inode, sector_t iblock, 736static int ext4_get_block_write(struct inode *inode, sector_t iblock,
1603 struct buffer_head *bh_result, int create); 737 struct buffer_head *bh_result, int create);
1604static int ext4_write_begin(struct file *file, struct address_space *mapping, 738static int ext4_write_begin(struct file *file, struct address_space *mapping,
@@ -1863,6 +997,7 @@ static int ext4_journalled_write_end(struct file *file,
1863 if (new_i_size > inode->i_size) 997 if (new_i_size > inode->i_size)
1864 i_size_write(inode, pos+copied); 998 i_size_write(inode, pos+copied);
1865 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 999 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1000 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1866 if (new_i_size > EXT4_I(inode)->i_disksize) { 1001 if (new_i_size > EXT4_I(inode)->i_disksize) {
1867 ext4_update_i_disksize(inode, new_i_size); 1002 ext4_update_i_disksize(inode, new_i_size);
1868 ret2 = ext4_mark_inode_dirty(handle, inode); 1003 ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2571,6 +1706,7 @@ static int __ext4_journalled_writepage(struct page *page,
2571 write_end_fn); 1706 write_end_fn);
2572 if (ret == 0) 1707 if (ret == 0)
2573 ret = err; 1708 ret = err;
1709 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
2574 err = ext4_journal_stop(handle); 1710 err = ext4_journal_stop(handle);
2575 if (!ret) 1711 if (!ret)
2576 ret = err; 1712 ret = err;
@@ -3450,112 +2586,6 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3450} 2586}
3451 2587
3452/* 2588/*
3453 * O_DIRECT for ext3 (or indirect map) based files
3454 *
3455 * If the O_DIRECT write will extend the file then add this inode to the
3456 * orphan list. So recovery will truncate it back to the original size
3457 * if the machine crashes during the write.
3458 *
3459 * If the O_DIRECT write is instantiating holes inside i_size and the machine
3460 * crashes then stale disk data _may_ be exposed inside the file. But current
3461 * VFS code falls back into buffered path in that case so we are safe.
3462 */
3463static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
3464 const struct iovec *iov, loff_t offset,
3465 unsigned long nr_segs)
3466{
3467 struct file *file = iocb->ki_filp;
3468 struct inode *inode = file->f_mapping->host;
3469 struct ext4_inode_info *ei = EXT4_I(inode);
3470 handle_t *handle;
3471 ssize_t ret;
3472 int orphan = 0;
3473 size_t count = iov_length(iov, nr_segs);
3474 int retries = 0;
3475
3476 if (rw == WRITE) {
3477 loff_t final_size = offset + count;
3478
3479 if (final_size > inode->i_size) {
3480 /* Credits for sb + inode write */
3481 handle = ext4_journal_start(inode, 2);
3482 if (IS_ERR(handle)) {
3483 ret = PTR_ERR(handle);
3484 goto out;
3485 }
3486 ret = ext4_orphan_add(handle, inode);
3487 if (ret) {
3488 ext4_journal_stop(handle);
3489 goto out;
3490 }
3491 orphan = 1;
3492 ei->i_disksize = inode->i_size;
3493 ext4_journal_stop(handle);
3494 }
3495 }
3496
3497retry:
3498 if (rw == READ && ext4_should_dioread_nolock(inode))
3499 ret = __blockdev_direct_IO(rw, iocb, inode,
3500 inode->i_sb->s_bdev, iov,
3501 offset, nr_segs,
3502 ext4_get_block, NULL, NULL, 0);
3503 else {
3504 ret = blockdev_direct_IO(rw, iocb, inode, iov,
3505 offset, nr_segs, ext4_get_block);
3506
3507 if (unlikely((rw & WRITE) && ret < 0)) {
3508 loff_t isize = i_size_read(inode);
3509 loff_t end = offset + iov_length(iov, nr_segs);
3510
3511 if (end > isize)
3512 ext4_truncate_failed_write(inode);
3513 }
3514 }
3515 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3516 goto retry;
3517
3518 if (orphan) {
3519 int err;
3520
3521 /* Credits for sb + inode write */
3522 handle = ext4_journal_start(inode, 2);
3523 if (IS_ERR(handle)) {
3524 /* This is really bad luck. We've written the data
3525 * but cannot extend i_size. Bail out and pretend
3526 * the write failed... */
3527 ret = PTR_ERR(handle);
3528 if (inode->i_nlink)
3529 ext4_orphan_del(NULL, inode);
3530
3531 goto out;
3532 }
3533 if (inode->i_nlink)
3534 ext4_orphan_del(handle, inode);
3535 if (ret > 0) {
3536 loff_t end = offset + ret;
3537 if (end > inode->i_size) {
3538 ei->i_disksize = end;
3539 i_size_write(inode, end);
3540 /*
3541 * We're going to return a positive `ret'
3542 * here due to non-zero-length I/O, so there's
3543 * no way of reporting error returns from
3544 * ext4_mark_inode_dirty() to userspace. So
3545 * ignore it.
3546 */
3547 ext4_mark_inode_dirty(handle, inode);
3548 }
3549 }
3550 err = ext4_journal_stop(handle);
3551 if (ret == 0)
3552 ret = err;
3553 }
3554out:
3555 return ret;
3556}
3557
3558/*
3559 * ext4_get_block used when preparing for a DIO write or buffer write. 2589 * ext4_get_block used when preparing for a DIO write or buffer write.
3560 * We allocate an uninitialized extent if blocks haven't been allocated. 2590 * We allocate an uninitialized extent if blocks haven't been allocated.
3561 * The extent will be converted to initialized after the IO is complete. 2591 * The extent will be converted to initialized after the IO is complete.
@@ -4033,383 +3063,6 @@ unlock:
4033 return err; 3063 return err;
4034} 3064}
4035 3065
4036/*
4037 * Probably it should be a library function... search for first non-zero word
4038 * or memcmp with zero_page, whatever is better for particular architecture.
4039 * Linus?
4040 */
4041static inline int all_zeroes(__le32 *p, __le32 *q)
4042{
4043 while (p < q)
4044 if (*p++)
4045 return 0;
4046 return 1;
4047}
4048
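The comment above wonders whether a memcmp()-against-zero_page variant would be better. One hedged way to write that, using a small static scratch buffer instead of zero_page; purely illustrative, not a proposal for the kernel.

#include <string.h>
#include <stdint.h>

/* memcmp-based all_zeroes(): compare the span against a block of zero
 * bytes, chunk by chunk, so the scratch buffer stays small. */
static int all_zeroes_memcmp(const uint32_t *p, const uint32_t *q)
{
	static const unsigned char zeroes[256];
	const unsigned char *b = (const unsigned char *)p;
	size_t len = (size_t)((const unsigned char *)q - b);

	while (len) {
		size_t n = len < sizeof(zeroes) ? len : sizeof(zeroes);

		if (memcmp(b, zeroes, n))
			return 0;
		b += n;
		len -= n;
	}
	return 1;
}

int main(void)
{
	uint32_t buf[8] = { 0 };

	return !all_zeroes_memcmp(buf, buf + 8); /* exits 0: all zero */
}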
4049/**
4050 * ext4_find_shared - find the indirect blocks for partial truncation.
4051 * @inode: inode in question
4052 * @depth: depth of the affected branch
4053 * @offsets: offsets of pointers in that branch (see ext4_block_to_path)
4054 * @chain: place to store the pointers to partial indirect blocks
4055 * @top: place to the (detached) top of branch
4056 *
4057 * This is a helper function used by ext4_truncate().
4058 *
4059 * When we do truncate() we may have to clean the ends of several
4060 * indirect blocks but leave the blocks themselves alive. Block is
4061 * partially truncated if some data below the new i_size is referred
4062 * from it (and it is on the path to the first completely truncated
4063 * data block, indeed). We have to free the top of that path along
4064 * with everything to the right of the path. Since no allocation
4065 * past the truncation point is possible until ext4_truncate()
4066 * finishes, we may safely do the latter, but top of branch may
4067 * require special attention - pageout below the truncation point
4068 * might try to populate it.
4069 *
4070 * We atomically detach the top of branch from the tree, store the
4071 * block number of its root in *@top, pointers to buffer_heads of
4072 * partially truncated blocks - in @chain[].bh and pointers to
4073 * their last elements that should not be removed - in
4074 * @chain[].p. Return value is the pointer to last filled element
4075 * of @chain.
4076 *
4077 * The work left to caller to do the actual freeing of subtrees:
4078 * a) free the subtree starting from *@top
4079 * b) free the subtrees whose roots are stored in
4080 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
4081 * c) free the subtrees growing from the inode past the @chain[0].
4082 * (no partially truncated stuff there). */
4083
4084static Indirect *ext4_find_shared(struct inode *inode, int depth,
4085 ext4_lblk_t offsets[4], Indirect chain[4],
4086 __le32 *top)
4087{
4088 Indirect *partial, *p;
4089 int k, err;
4090
4091 *top = 0;
4092 /* Make k index the deepest non-null offset + 1 */
4093 for (k = depth; k > 1 && !offsets[k-1]; k--)
4094 ;
4095 partial = ext4_get_branch(inode, k, offsets, chain, &err);
4096 /* Writer: pointers */
4097 if (!partial)
4098 partial = chain + k-1;
4099 /*
4100 * If the branch acquired continuation since we've looked at it -
4101 * fine, it should all survive and (new) top doesn't belong to us.
4102 */
4103 if (!partial->key && *partial->p)
4104 /* Writer: end */
4105 goto no_top;
4106 for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
4107 ;
4108 /*
4109 * OK, we've found the last block that must survive. The rest of our
4110 * branch should be detached before unlocking. However, if that rest
4111 * of branch is all ours and does not grow immediately from the inode
4112 * it's easier to cheat and just decrement partial->p.
4113 */
4114 if (p == chain + k - 1 && p > chain) {
4115 p->p--;
4116 } else {
4117 *top = *p->p;
4118 /* Nope, don't do this in ext4. Must leave the tree intact */
4119#if 0
4120 *p->p = 0;
4121#endif
4122 }
4123 /* Writer: end */
4124
4125 while (partial > p) {
4126 brelse(partial->bh);
4127 partial--;
4128 }
4129no_top:
4130 return partial;
4131}
4132
4133/*
4134 * Zero a number of block pointers in either an inode or an indirect block.
4135 * If we restart the transaction we must again get write access to the
4136 * indirect block for further modification.
4137 *
4138 * We release `count' blocks on disk, but (last - first) may be greater
4139 * than `count' because there can be holes in there.
4140 *
4141 * Return 0 on success, 1 on invalid block range
4142 * and < 0 on fatal error.
4143 */
4144static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4145 struct buffer_head *bh,
4146 ext4_fsblk_t block_to_free,
4147 unsigned long count, __le32 *first,
4148 __le32 *last)
4149{
4150 __le32 *p;
4151 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED;
4152 int err;
4153
4154 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4155 flags |= EXT4_FREE_BLOCKS_METADATA;
4156
4157 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4158 count)) {
4159 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4160 "blocks %llu len %lu",
4161 (unsigned long long) block_to_free, count);
4162 return 1;
4163 }
4164
4165 if (try_to_extend_transaction(handle, inode)) {
4166 if (bh) {
4167 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4168 err = ext4_handle_dirty_metadata(handle, inode, bh);
4169 if (unlikely(err))
4170 goto out_err;
4171 }
4172 err = ext4_mark_inode_dirty(handle, inode);
4173 if (unlikely(err))
4174 goto out_err;
4175 err = ext4_truncate_restart_trans(handle, inode,
4176 blocks_for_truncate(inode));
4177 if (unlikely(err))
4178 goto out_err;
4179 if (bh) {
4180 BUFFER_TRACE(bh, "retaking write access");
4181 err = ext4_journal_get_write_access(handle, bh);
4182 if (unlikely(err))
4183 goto out_err;
4184 }
4185 }
4186
4187 for (p = first; p < last; p++)
4188 *p = 0;
4189
4190 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4191 return 0;
4192out_err:
4193 ext4_std_error(inode->i_sb, err);
4194 return err;
4195}
4196
4197/**
4198 * ext4_free_data - free a list of data blocks
4199 * @handle: handle for this transaction
4200 * @inode: inode we are dealing with
4201 * @this_bh: indirect buffer_head which contains *@first and *@last
4202 * @first: array of block numbers
4203 * @last: points immediately past the end of array
4204 *
4205 * We are freeing all blocks referred from that array (numbers are stored as
4206 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4207 *
4208 * We accumulate contiguous runs of blocks to free. Conveniently, if these
4209 * blocks are contiguous then releasing them at one time will only affect one
4210 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
4211 * actually use a lot of journal space.
4212 *
4213 * @this_bh will be %NULL if @first and @last point into the inode's direct
4214 * block pointers.
4215 */
4216static void ext4_free_data(handle_t *handle, struct inode *inode,
4217 struct buffer_head *this_bh,
4218 __le32 *first, __le32 *last)
4219{
4220 ext4_fsblk_t block_to_free = 0; /* Starting block # of a run */
4221 unsigned long count = 0; /* Number of blocks in the run */
4222 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
4223 corresponding to
4224 block_to_free */
4225 ext4_fsblk_t nr; /* Current block # */
4226 __le32 *p; /* Pointer into inode/ind
4227 for current block */
4228 int err = 0;
4229
4230 if (this_bh) { /* For indirect block */
4231 BUFFER_TRACE(this_bh, "get_write_access");
4232 err = ext4_journal_get_write_access(handle, this_bh);
4233 /* Important: if we can't update the indirect pointers
4234 * to the blocks, we can't free them. */
4235 if (err)
4236 return;
4237 }
4238
4239 for (p = first; p < last; p++) {
4240 nr = le32_to_cpu(*p);
4241 if (nr) {
4242 /* accumulate blocks to free if they're contiguous */
4243 if (count == 0) {
4244 block_to_free = nr;
4245 block_to_free_p = p;
4246 count = 1;
4247 } else if (nr == block_to_free + count) {
4248 count++;
4249 } else {
4250 err = ext4_clear_blocks(handle, inode, this_bh,
4251 block_to_free, count,
4252 block_to_free_p, p);
4253 if (err)
4254 break;
4255 block_to_free = nr;
4256 block_to_free_p = p;
4257 count = 1;
4258 }
4259 }
4260 }
4261
4262 if (!err && count > 0)
4263 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4264 count, block_to_free_p, p);
4265 if (err < 0)
4266 /* fatal error */
4267 return;
4268
4269 if (this_bh) {
4270 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
4271
4272 /*
4273 * The buffer head should have an attached journal head at this
4274 * point. However, if the data is corrupted and an indirect
4275 * block pointed to itself, it would have been detached when
4276 * the block was cleared. Check for this instead of OOPSing.
4277 */
4278 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4279 ext4_handle_dirty_metadata(handle, inode, this_bh);
4280 else
4281 EXT4_ERROR_INODE(inode,
4282 "circular indirect block detected at "
4283 "block %llu",
4284 (unsigned long long) this_bh->b_blocknr);
4285 }
4286}
4287
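ext4_free_data() is a run-length batcher: it walks the pointer array, skips holes, and flushes one (start, count) pair per contiguous run, so each run touches at most a bitmap block or two in the journal. The batching alone, with the free call stubbed out as a printf; the demo_* names are invented for the sketch.

#include <stdio.h>
#include <stdint.h>

/* Stand-in for ext4_free_blocks(): just report the run. */
static void demo_free_run(uint32_t start, unsigned long count)
{
	printf("free %lu block(s) starting at %u\n", count, (unsigned)start);
}

/* Accumulate contiguous runs the way ext4_free_data() does; zero
 * entries are holes and do not terminate a pending run by themselves. */
static void demo_free_data(const uint32_t *first, const uint32_t *last)
{
	uint32_t block_to_free = 0;
	unsigned long count = 0;
	const uint32_t *p;

	for (p = first; p < last; p++) {
		uint32_t nr = *p;

		if (!nr)
			continue; /* a hole */
		if (count == 0) {
			block_to_free = nr;
			count = 1;
		} else if (nr == block_to_free + count) {
			count++;
		} else {
			demo_free_run(block_to_free, count);
			block_to_free = nr;
			count = 1;
		}
	}
	if (count)
		demo_free_run(block_to_free, count);
}

int main(void)
{
	uint32_t blocks[] = { 100, 101, 102, 0, 200, 201 };

	demo_free_data(blocks, blocks + 6);
	return 0;
}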
4288/**
4289 * ext4_free_branches - free an array of branches
4290 * @handle: JBD handle for this transaction
4291 * @inode: inode we are dealing with
4292 * @parent_bh: the buffer_head which contains *@first and *@last
4293 * @first: array of block numbers
4294 * @last: pointer immediately past the end of array
4295 * @depth: depth of the branches to free
4296 *
4297 * We are freeing all blocks referred from these branches (numbers are
4298 * stored as little-endian 32-bit) and updating @inode->i_blocks
4299 * appropriately.
4300 */
4301static void ext4_free_branches(handle_t *handle, struct inode *inode,
4302 struct buffer_head *parent_bh,
4303 __le32 *first, __le32 *last, int depth)
4304{
4305 ext4_fsblk_t nr;
4306 __le32 *p;
4307
4308 if (ext4_handle_is_aborted(handle))
4309 return;
4310
4311 if (depth--) {
4312 struct buffer_head *bh;
4313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4314 p = last;
4315 while (--p >= first) {
4316 nr = le32_to_cpu(*p);
4317 if (!nr)
4318 continue; /* A hole */
4319
4320 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4321 nr, 1)) {
4322 EXT4_ERROR_INODE(inode,
4323 "invalid indirect mapped "
4324 "block %lu (level %d)",
4325 (unsigned long) nr, depth);
4326 break;
4327 }
4328
4329 /* Go read the buffer for the next level down */
4330 bh = sb_bread(inode->i_sb, nr);
4331
4332 /*
4333 * A read failure? Report error and clear slot
4334 * (should be rare).
4335 */
4336 if (!bh) {
4337 EXT4_ERROR_INODE_BLOCK(inode, nr,
4338 "Read failure");
4339 continue;
4340 }
4341
4342 /* This zaps the entire block. Bottom up. */
4343 BUFFER_TRACE(bh, "free child branches");
4344 ext4_free_branches(handle, inode, bh,
4345 (__le32 *) bh->b_data,
4346 (__le32 *) bh->b_data + addr_per_block,
4347 depth);
4348 brelse(bh);
4349
4350 /*
4351			 * Everything below this pointer has been
4352 * released. Now let this top-of-subtree go.
4353 *
4354 * We want the freeing of this indirect block to be
4355 * atomic in the journal with the updating of the
4356 * bitmap block which owns it. So make some room in
4357 * the journal.
4358 *
4359 * We zero the parent pointer *after* freeing its
4360 * pointee in the bitmaps, so if extend_transaction()
4361 * for some reason fails to put the bitmap changes and
4362 * the release into the same transaction, recovery
4363 * will merely complain about releasing a free block,
4364 * rather than leaking blocks.
4365 */
4366 if (ext4_handle_is_aborted(handle))
4367 return;
4368 if (try_to_extend_transaction(handle, inode)) {
4369 ext4_mark_inode_dirty(handle, inode);
4370 ext4_truncate_restart_trans(handle, inode,
4371 blocks_for_truncate(inode));
4372 }
4373
4374 /*
4375 * The forget flag here is critical because if
4376 * we are journaling (and not doing data
4377 * journaling), we have to make sure a revoke
4378 * record is written to prevent the journal
4379 * replay from overwriting the (former)
4380 * indirect block if it gets reallocated as a
4381 * data block. This must happen in the same
4382 * transaction where the data blocks are
4383 * actually freed.
4384 */
4385 ext4_free_blocks(handle, inode, NULL, nr, 1,
4386 EXT4_FREE_BLOCKS_METADATA|
4387 EXT4_FREE_BLOCKS_FORGET);
4388
4389 if (parent_bh) {
4390 /*
4391 * The block which we have just freed is
4392 * pointed to by an indirect block: journal it
4393 */
4394 BUFFER_TRACE(parent_bh, "get_write_access");
4395 if (!ext4_journal_get_write_access(handle,
4396 parent_bh)){
4397 *p = 0;
4398 BUFFER_TRACE(parent_bh,
4399 "call ext4_handle_dirty_metadata");
4400 ext4_handle_dirty_metadata(handle,
4401 inode,
4402 parent_bh);
4403 }
4404 }
4405 }
4406 } else {
4407 /* We have reached the bottom of the tree. */
4408 BUFFER_TRACE(parent_bh, "free data blocks");
4409 ext4_free_data(handle, inode, parent_bh, first, last);
4410 }
4411}
4412
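The recursion above frees an indirect tree bottom-up: children first, then the parent pointer is zeroed so a crash never leaves a pointer to a freed block. A minimal userspace model of the same walk, using a toy in-memory "disk"; none of these names are ext4 symbols:

#include <stdint.h>
#include <stdio.h>

#define PTRS 4                        /* pointers per indirect block (toy value) */

static uint32_t disk[16][PTRS];       /* block nr -> its array of pointers */
static int freed[16];

static void free_branches(uint32_t *first, uint32_t *last, int depth)
{
        for (uint32_t *p = last; p != first; ) {
                uint32_t nr = *--p;
                if (!nr)
                        continue;                  /* a hole */
                if (depth > 1)                     /* free the children first */
                        free_branches(disk[nr], disk[nr] + PTRS, depth - 1);
                freed[nr] = 1;                     /* "release" this block */
                *p = 0;            /* zero the parent pointer after the free */
        }
}

int main(void)
{
        uint32_t root[1] = { 1 };     /* one indirect block... */
        disk[1][0] = 2;               /* ...pointing at data blocks 2 and 3 */
        disk[1][1] = 3;
        free_branches(root, root + 1, 2);
        printf("freed: 1=%d 2=%d 3=%d\n", freed[1], freed[2], freed[3]);
        return 0;
}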
4413int ext4_can_truncate(struct inode *inode) 3066int ext4_can_truncate(struct inode *inode)
4414{ 3067{
4415 if (S_ISREG(inode->i_mode)) 3068 if (S_ISREG(inode->i_mode))
@@ -4476,19 +3129,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
4476 */ 3129 */
4477void ext4_truncate(struct inode *inode) 3130void ext4_truncate(struct inode *inode)
4478{ 3131{
4479 handle_t *handle;
4480 struct ext4_inode_info *ei = EXT4_I(inode);
4481 __le32 *i_data = ei->i_data;
4482 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
4483 struct address_space *mapping = inode->i_mapping;
4484 ext4_lblk_t offsets[4];
4485 Indirect chain[4];
4486 Indirect *partial;
4487 __le32 nr = 0;
4488 int n = 0;
4489 ext4_lblk_t last_block, max_block;
4490 unsigned blocksize = inode->i_sb->s_blocksize;
4491
4492 trace_ext4_truncate_enter(inode); 3132 trace_ext4_truncate_enter(inode);
4493 3133
4494 if (!ext4_can_truncate(inode)) 3134 if (!ext4_can_truncate(inode))
@@ -4499,149 +3139,11 @@ void ext4_truncate(struct inode *inode)
4499 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 3139 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4500 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 3140 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4501 3141
4502 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 3142 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
4503 ext4_ext_truncate(inode); 3143 ext4_ext_truncate(inode);
4504 trace_ext4_truncate_exit(inode); 3144 else
4505 return; 3145 ext4_ind_truncate(inode);
4506 }
4507
4508 handle = start_transaction(inode);
4509 if (IS_ERR(handle))
4510 return; /* AKPM: return what? */
4511
4512 last_block = (inode->i_size + blocksize-1)
4513 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4514 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4515 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4516
4517 if (inode->i_size & (blocksize - 1))
4518 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4519 goto out_stop;
4520
4521 if (last_block != max_block) {
4522 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4523 if (n == 0)
4524 goto out_stop; /* error */
4525 }
4526
4527 /*
4528 * OK. This truncate is going to happen. We add the inode to the
4529 * orphan list, so that if this truncate spans multiple transactions,
4530 * and we crash, we will resume the truncate when the filesystem
4531 * recovers. It also marks the inode dirty, to catch the new size.
4532 *
4533 * Implication: the file must always be in a sane, consistent
4534 * truncatable state while each transaction commits.
4535 */
4536 if (ext4_orphan_add(handle, inode))
4537 goto out_stop;
4538
4539 /*
4540 * From here we block out all ext4_get_block() callers who want to
4541 * modify the block allocation tree.
4542 */
4543 down_write(&ei->i_data_sem);
4544
4545 ext4_discard_preallocations(inode);
4546
4547 /*
4548 * The orphan list entry will now protect us from any crash which
4549 * occurs before the truncate completes, so it is now safe to propagate
4550 * the new, shorter inode size (held for now in i_size) into the
4551 * on-disk inode. We do this via i_disksize, which is the value which
4552 * ext4 *really* writes onto the disk inode.
4553 */
4554 ei->i_disksize = inode->i_size;
4555
4556 if (last_block == max_block) {
4557 /*
4558 * It is unnecessary to free any data blocks if last_block is
4559 * equal to the indirect block limit.
4560 */
4561 goto out_unlock;
4562 } else if (n == 1) { /* direct blocks */
4563 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4564 i_data + EXT4_NDIR_BLOCKS);
4565 goto do_indirects;
4566 }
4567
4568 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
4569 /* Kill the top of shared branch (not detached) */
4570 if (nr) {
4571 if (partial == chain) {
4572 /* Shared branch grows from the inode */
4573 ext4_free_branches(handle, inode, NULL,
4574 &nr, &nr+1, (chain+n-1) - partial);
4575 *partial->p = 0;
4576 /*
4577 * We mark the inode dirty prior to restart,
4578 * and prior to stop. No need for it here.
4579 */
4580 } else {
4581 /* Shared branch grows from an indirect block */
4582 BUFFER_TRACE(partial->bh, "get_write_access");
4583 ext4_free_branches(handle, inode, partial->bh,
4584 partial->p,
4585 partial->p+1, (chain+n-1) - partial);
4586 }
4587 }
4588 /* Clear the ends of indirect blocks on the shared branch */
4589 while (partial > chain) {
4590 ext4_free_branches(handle, inode, partial->bh, partial->p + 1,
4591 (__le32*)partial->bh->b_data+addr_per_block,
4592 (chain+n-1) - partial);
4593 BUFFER_TRACE(partial->bh, "call brelse");
4594 brelse(partial->bh);
4595 partial--;
4596 }
4597do_indirects:
4598 /* Kill the remaining (whole) subtrees */
4599 switch (offsets[0]) {
4600 default:
4601 nr = i_data[EXT4_IND_BLOCK];
4602 if (nr) {
4603 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
4604 i_data[EXT4_IND_BLOCK] = 0;
4605 }
4606 case EXT4_IND_BLOCK:
4607 nr = i_data[EXT4_DIND_BLOCK];
4608 if (nr) {
4609 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
4610 i_data[EXT4_DIND_BLOCK] = 0;
4611 }
4612 case EXT4_DIND_BLOCK:
4613 nr = i_data[EXT4_TIND_BLOCK];
4614 if (nr) {
4615 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
4616 i_data[EXT4_TIND_BLOCK] = 0;
4617 }
4618 case EXT4_TIND_BLOCK:
4619 ;
4620 }
4621
4622out_unlock:
4623 up_write(&ei->i_data_sem);
4624 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4625 ext4_mark_inode_dirty(handle, inode);
4626
4627 /*
4628 * In a multi-transaction truncate, we only make the final transaction
4629 * synchronous
4630 */
4631 if (IS_SYNC(inode))
4632 ext4_handle_sync(handle);
4633out_stop:
4634 /*
4635 * If this was a simple ftruncate(), and the file will remain alive
4636 * then we need to clear up the orphan record which we created above.
4637 * However, if this was a real unlink then we were called by
4638 * ext4_delete_inode(), and we allow that function to clean up the
4639 * orphan info for us.
4640 */
4641 if (inode->i_nlink)
4642 ext4_orphan_del(handle, inode);
4643 3146
4644 ext4_journal_stop(handle);
4645 trace_ext4_truncate_exit(inode); 3147 trace_ext4_truncate_exit(inode);
4646} 3148}
4647 3149
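The code removed above (now living in ext4_ind_truncate()) documents the crash-safe ordering: put the inode on the on-disk orphan list before shrinking, shrink across as many transactions as needed, then clear the orphan record if the inode is still linked. A compressed sketch of that protocol, with purely illustrative names rather than the ext4 API:

#include <stdio.h>

struct toy_inode { int nlink; long long size; };

static void orphan_add(struct toy_inode *i)
{
        printf("orphan add (nlink=%d)\n", i->nlink);
}

static void orphan_del(struct toy_inode *i)
{
        printf("orphan del (nlink=%d)\n", i->nlink);
}

static void shrink(struct toy_inode *i, long long n)
{
        printf("free blocks past %lld (may span several transactions)\n", n);
        i->size = n;
}

static void truncate_protocol(struct toy_inode *inode, long long new_size)
{
        /* 1. Orphan record first: a crash after this point makes
         *    journal recovery resume and finish the truncate. */
        orphan_add(inode);
        /* 2. Shrink; the file must stay consistent at every commit. */
        shrink(inode, new_size);
        /* 3. Plain ftruncate() (inode still linked): drop the orphan
         *    record ourselves; a real unlink leaves that to the final
         *    inode deletion. */
        if (inode->nlink)
                orphan_del(inode);
}

int main(void)
{
        struct toy_inode i = { .nlink = 1, .size = 4096 };
        truncate_protocol(&i, 0);
        return 0;
}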
@@ -5012,7 +3514,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5012 (S_ISLNK(inode->i_mode) && 3514 (S_ISLNK(inode->i_mode) &&
5013 !ext4_inode_is_fast_symlink(inode))) { 3515 !ext4_inode_is_fast_symlink(inode))) {
5014 /* Validate block references which are part of inode */ 3516 /* Validate block references which are part of inode */
5015 ret = ext4_check_inode_blockref(inode); 3517 ret = ext4_ind_check_inode(inode);
5016 } 3518 }
5017 if (ret) 3519 if (ret)
5018 goto bad_inode; 3520 goto bad_inode;
@@ -5459,34 +3961,10 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
5459 return 0; 3961 return 0;
5460} 3962}
5461 3963
5462static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5463 int chunk)
5464{
5465 int indirects;
5466
5467 /* if nrblocks are contiguous */
5468 if (chunk) {
5469 /*
5470 * With N contiguous data blocks, we need at most
5471 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5472 * 2 dindirect blocks, and 1 tindirect block
5473 */
5474 return DIV_ROUND_UP(nrblocks,
5475 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5476 }
5477 /*
5478 * if nrblocks are not contiguous, worst case, each block touches
5479 * an indirect block, and each indirect block touches a double indirect
5480 * block, plus a triple indirect block
5481 */
5482 indirects = nrblocks * 2 + 1;
5483 return indirects;
5484}
5485
5486static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 3964static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5487{ 3965{
5488 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3966 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5489 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 3967 return ext4_ind_trans_blocks(inode, nrblocks, chunk);
5490 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 3968 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5491} 3969}
5492 3970
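For reference, the deleted ext4_indirect_trans_blocks() credited DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK) + 4 journal blocks for a contiguous chunk and nrblocks * 2 + 1 otherwise. A standalone check of that arithmetic; the 1024-pointer constant assumes a 4K block size with 4-byte block pointers:

#include <stdio.h>

#define ADDR_PER_BLOCK 1024           /* 4096-byte block / 4-byte pointers */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static int indirect_trans_blocks(int nrblocks, int contiguous)
{
        if (contiguous)
                /* N/ADDR_PER_BLOCK + 1 indirects, 2 dindirects, 1 tindirect */
                return DIV_ROUND_UP(nrblocks, ADDR_PER_BLOCK) + 4;
        /* worst case: every block hits its own indirect + dindirect chain */
        return nrblocks * 2 + 1;
}

int main(void)
{
        printf("2048 contiguous blocks -> %d credits\n",
               indirect_trans_blocks(2048, 1));    /* 2 + 4 = 6 */
        printf("8 scattered blocks    -> %d credits\n",
               indirect_trans_blocks(8, 0));       /* 17 */
        return 0;
}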
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 808c554e773f..f18bfe37aff8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -202,8 +202,9 @@ setversion_out:
202 struct super_block *sb = inode->i_sb; 202 struct super_block *sb = inode->i_sb;
203 int err, err2=0; 203 int err, err2=0;
204 204
205 if (!capable(CAP_SYS_RESOURCE)) 205 err = ext4_resize_begin(sb);
206 return -EPERM; 206 if (err)
207 return err;
207 208
208 if (get_user(n_blocks_count, (__u32 __user *)arg)) 209 if (get_user(n_blocks_count, (__u32 __user *)arg))
209 return -EFAULT; 210 return -EFAULT;
@@ -221,6 +222,7 @@ setversion_out:
221 if (err == 0) 222 if (err == 0)
222 err = err2; 223 err = err2;
223 mnt_drop_write(filp->f_path.mnt); 224 mnt_drop_write(filp->f_path.mnt);
225 ext4_resize_end(sb);
224 226
225 return err; 227 return err;
226 } 228 }
@@ -271,8 +273,9 @@ mext_out:
271 struct super_block *sb = inode->i_sb; 273 struct super_block *sb = inode->i_sb;
272 int err, err2=0; 274 int err, err2=0;
273 275
274 if (!capable(CAP_SYS_RESOURCE)) 276 err = ext4_resize_begin(sb);
275 return -EPERM; 277 if (err)
278 return err;
276 279
277 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, 280 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
278 sizeof(input))) 281 sizeof(input)))
@@ -291,6 +294,7 @@ mext_out:
291 if (err == 0) 294 if (err == 0)
292 err = err2; 295 err = err2;
293 mnt_drop_write(filp->f_path.mnt); 296 mnt_drop_write(filp->f_path.mnt);
297 ext4_resize_end(sb);
294 298
295 return err; 299 return err;
296 } 300 }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 6ed859d56850..17a5a57c415a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -75,8 +75,8 @@
75 * 75 *
76 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
77 * block. If only the logical file block falls within the range of prealloc 77 * block. If only the logical file block falls within the range of prealloc
78 * space we will consume the particular prealloc space. This make sure that 78 * space we will consume the particular prealloc space. This makes sure that
79 * that the we have contiguous physical blocks representing the file blocks 79 * we have contiguous physical blocks representing the file blocks
80 * 80 *
81 * The important thing to be noted in case of inode prealloc space is that 81 * The important thing to be noted in case of inode prealloc space is that
82 * we don't modify the values associated to inode prealloc space except 82 * we don't modify the values associated to inode prealloc space except
@@ -84,7 +84,7 @@
84 * 84 *
85 * If we are not able to find blocks in the inode prealloc space and if we 85 * If we are not able to find blocks in the inode prealloc space and if we
86 * have the group allocation flag set then we look at the locality group 86 * have the group allocation flag set then we look at the locality group
87 * prealloc space. These are per CPU prealloc list repreasented as 87 * prealloc space. These are per CPU prealloc list represented as
88 * 88 *
89 * ext4_sb_info.s_locality_groups[smp_processor_id()] 89 * ext4_sb_info.s_locality_groups[smp_processor_id()]
90 * 90 *
@@ -128,12 +128,13 @@
128 * we are doing a group prealloc we try to normalize the request to 128 * we are doing a group prealloc we try to normalize the request to
129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is 129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
130 * 512 blocks. This can be tuned via 130 * 512 blocks. This can be tuned via
131 * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in 131 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
132 * terms of number of blocks. If we have mounted the file system with -O 132 * terms of number of blocks. If we have mounted the file system with -O
133 * stripe=<value> option the group prealloc request is normalized to the 133 * stripe=<value> option the group prealloc request is normalized to the
134 stripe value (sbi->s_stripe) 134 smallest multiple of the stripe value (sbi->s_stripe) which is
135 * greater than the default mb_group_prealloc.
135 * 136 *
136 * The regular allocator(using the buddy cache) supports few tunables. 137 * The regular allocator (using the buddy cache) supports a few tunables.
137 * 138 *
138 * /sys/fs/ext4/<partition>/mb_min_to_scan 139 * /sys/fs/ext4/<partition>/mb_min_to_scan
139 * /sys/fs/ext4/<partition>/mb_max_to_scan 140 * /sys/fs/ext4/<partition>/mb_max_to_scan
@@ -152,7 +153,7 @@
152 * best extent in the found extents. Searching for the blocks starts with 153 * best extent in the found extents. Searching for the blocks starts with
153 * the group specified as the goal value in allocation context via 154 * the group specified as the goal value in allocation context via
154 * ac_g_ex. Each group is first checked based on the criteria whether it 155 * ac_g_ex. Each group is first checked based on the criteria whether it
155 * can used for allocation. ext4_mb_good_group explains how the groups are 156 * can be used for allocation. ext4_mb_good_group explains how the groups are
156 * checked. 157 * checked.
157 * 158 *
158 * Both the prealloc spaces are populated as above. So for the first 159 * Both the prealloc spaces are populated as above. So for the first
@@ -492,10 +493,11 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
492 b2 = (unsigned char *) bitmap; 493 b2 = (unsigned char *) bitmap;
493 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 494 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
494 if (b1[i] != b2[i]) { 495 if (b1[i] != b2[i]) {
495 printk(KERN_ERR "corruption in group %u " 496 ext4_msg(e4b->bd_sb, KERN_ERR,
496 "at byte %u(%u): %x in copy != %x " 497 "corruption in group %u "
497 "on disk/prealloc\n", 498 "at byte %u(%u): %x in copy != %x "
498 e4b->bd_group, i, i * 8, b1[i], b2[i]); 499 "on disk/prealloc",
500 e4b->bd_group, i, i * 8, b1[i], b2[i]);
499 BUG(); 501 BUG();
500 } 502 }
501 } 503 }
@@ -1125,7 +1127,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1125 grp = ext4_get_group_info(sb, group); 1127 grp = ext4_get_group_info(sb, group);
1126 1128
1127 e4b->bd_blkbits = sb->s_blocksize_bits; 1129 e4b->bd_blkbits = sb->s_blocksize_bits;
1128 e4b->bd_info = ext4_get_group_info(sb, group); 1130 e4b->bd_info = grp;
1129 e4b->bd_sb = sb; 1131 e4b->bd_sb = sb;
1130 e4b->bd_group = group; 1132 e4b->bd_group = group;
1131 e4b->bd_buddy_page = NULL; 1133 e4b->bd_buddy_page = NULL;
@@ -1281,7 +1283,7 @@ static void mb_clear_bits(void *bm, int cur, int len)
1281 } 1283 }
1282} 1284}
1283 1285
1284static void mb_set_bits(void *bm, int cur, int len) 1286void ext4_set_bits(void *bm, int cur, int len)
1285{ 1287{
1286 __u32 *addr; 1288 __u32 *addr;
1287 1289
@@ -1510,7 +1512,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1510 } 1512 }
1511 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1513 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1512 1514
1513 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1515 ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1514 mb_check_buddy(e4b); 1516 mb_check_buddy(e4b);
1515 1517
1516 return ret; 1518 return ret;
@@ -2223,8 +2225,8 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2223 EXT4_DESC_PER_BLOCK_BITS(sb); 2225 EXT4_DESC_PER_BLOCK_BITS(sb);
2224 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2226 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2225 if (meta_group_info == NULL) { 2227 if (meta_group_info == NULL) {
2226 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2228 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem "
2227 "buddy group\n"); 2229 "for a buddy group");
2228 goto exit_meta_group_info; 2230 goto exit_meta_group_info;
2229 } 2231 }
2230 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2232 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
@@ -2237,7 +2239,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2237 2239
2238 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2240 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2239 if (meta_group_info[i] == NULL) { 2241 if (meta_group_info[i] == NULL) {
2240 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2242 ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem");
2241 goto exit_group_info; 2243 goto exit_group_info;
2242 } 2244 }
2243 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2245 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2279,8 +2281,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2279 2281
2280exit_group_info: 2282exit_group_info:
2281 /* If a meta_group_info table has been allocated, release it now */ 2283 /* If a meta_group_info table has been allocated, release it now */
2282 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) 2284 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2283 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2285 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2286 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2287 }
2284exit_meta_group_info: 2288exit_meta_group_info:
2285 return -ENOMEM; 2289 return -ENOMEM;
2286} /* ext4_mb_add_groupinfo */ 2290} /* ext4_mb_add_groupinfo */
@@ -2328,23 +2332,26 @@ static int ext4_mb_init_backend(struct super_block *sb)
2328 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2332 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2329 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2333 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2330 * So a two level scheme suffices for now. */ 2334 * So a two level scheme suffices for now. */
2331 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); 2335 sbi->s_group_info = ext4_kvzalloc(array_size, GFP_KERNEL);
2332 if (sbi->s_group_info == NULL) { 2336 if (sbi->s_group_info == NULL) {
2333 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2337 ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2334 return -ENOMEM; 2338 return -ENOMEM;
2335 } 2339 }
2336 sbi->s_buddy_cache = new_inode(sb); 2340 sbi->s_buddy_cache = new_inode(sb);
2337 if (sbi->s_buddy_cache == NULL) { 2341 if (sbi->s_buddy_cache == NULL) {
2338 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2342 ext4_msg(sb, KERN_ERR, "can't get new inode");
2339 goto err_freesgi; 2343 goto err_freesgi;
2340 } 2344 }
2341 sbi->s_buddy_cache->i_ino = get_next_ino(); 2345 /* To avoid potentially colliding with a valid on-disk inode number,
2346 * use EXT4_BAD_INO for the buddy cache inode number. This inode is
2347 * not in the inode hash, so it should never be found by iget(), but
2348 * this will avoid confusion if it ever shows up during debugging. */
2349 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2342 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2350 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2343 for (i = 0; i < ngroups; i++) { 2351 for (i = 0; i < ngroups; i++) {
2344 desc = ext4_get_group_desc(sb, i, NULL); 2352 desc = ext4_get_group_desc(sb, i, NULL);
2345 if (desc == NULL) { 2353 if (desc == NULL) {
2346 printk(KERN_ERR 2354 ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2347 "EXT4-fs: can't read descriptor %u\n", i);
2348 goto err_freebuddy; 2355 goto err_freebuddy;
2349 } 2356 }
2350 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2357 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
@@ -2362,7 +2369,7 @@ err_freebuddy:
2362 kfree(sbi->s_group_info[i]); 2369 kfree(sbi->s_group_info[i]);
2363 iput(sbi->s_buddy_cache); 2370 iput(sbi->s_buddy_cache);
2364err_freesgi: 2371err_freesgi:
2365 kfree(sbi->s_group_info); 2372 ext4_kvfree(sbi->s_group_info);
2366 return -ENOMEM; 2373 return -ENOMEM;
2367} 2374}
2368 2375
@@ -2404,14 +2411,15 @@ static int ext4_groupinfo_create_slab(size_t size)
2404 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 2411 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2405 NULL); 2412 NULL);
2406 2413
2414 ext4_groupinfo_caches[cache_index] = cachep;
2415
2407 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2416 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2408 if (!cachep) { 2417 if (!cachep) {
2409 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); 2418 printk(KERN_EMERG
2419 "EXT4-fs: no memory for groupinfo slab cache\n");
2410 return -ENOMEM; 2420 return -ENOMEM;
2411 } 2421 }
2412 2422
2413 ext4_groupinfo_caches[cache_index] = cachep;
2414
2415 return 0; 2423 return 0;
2416} 2424}
2417 2425
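The reordering above closes a race in ext4_groupinfo_create_slab(): the new cache pointer is now stored in ext4_groupinfo_caches[] before the mutex is dropped, so a second caller serialized on the same mutex can never find the slot empty and create a duplicate slab. The same publish-under-lock shape in miniature, with illustrative names in place of the ext4 symbols:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t create_mutex = PTHREAD_MUTEX_INITIALIZER;
static void *caches[8];               /* models ext4_groupinfo_caches[] */

static int get_or_create_cache(int idx)
{
        void *cachep;

        pthread_mutex_lock(&create_mutex);
        if (!caches[idx])
                /* create *and* publish while still holding the mutex, so
                 * a racing caller can never see an empty slot and build a
                 * duplicate cache */
                caches[idx] = malloc(128);   /* stands in for kmem_cache_create() */
        cachep = caches[idx];
        pthread_mutex_unlock(&create_mutex);
        return cachep ? 0 : -1;              /* -ENOMEM in the real code */
}

int main(void)
{
        return get_or_create_cache(3);
}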
@@ -2457,12 +2465,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2457 i++; 2465 i++;
2458 } while (i <= sb->s_blocksize_bits + 1); 2466 } while (i <= sb->s_blocksize_bits + 1);
2459 2467
2460 /* init file for buddy data */
2461 ret = ext4_mb_init_backend(sb);
2462 if (ret != 0) {
2463 goto out;
2464 }
2465
2466 spin_lock_init(&sbi->s_md_lock); 2468 spin_lock_init(&sbi->s_md_lock);
2467 spin_lock_init(&sbi->s_bal_lock); 2469 spin_lock_init(&sbi->s_bal_lock);
2468 2470
@@ -2472,6 +2474,18 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2472 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2474 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2473 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2475 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2474 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2476 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2477 /*
2478 * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2479 * to the lowest multiple of s_stripe which is bigger than
2480 * the s_mb_group_prealloc as determined above. We want
2481 * the preallocation size to be an exact multiple of the
2482 * RAID stripe size so that preallocations don't fragment
2483 * the stripes.
2484 */
2485 if (sbi->s_stripe > 1) {
2486 sbi->s_mb_group_prealloc = roundup(
2487 sbi->s_mb_group_prealloc, sbi->s_stripe);
2488 }
2475 2489
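The initialization just added keeps group preallocations stripe-aligned, so a preallocated chunk never straddles RAID stripes. Concretely, with the default 512-block prealloc and an assumed 48-block stripe, roundup() yields 528, the smallest multiple of 48 at or above 512:

#include <stdio.h>

#define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned prealloc = 512, stripe = 48;   /* example values */
        printf("%u\n", roundup(prealloc, stripe));   /* prints 528 */
        return 0;
}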
2476 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2490 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2477 if (sbi->s_locality_groups == NULL) { 2491 if (sbi->s_locality_groups == NULL) {
@@ -2487,6 +2501,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2487 spin_lock_init(&lg->lg_prealloc_lock); 2501 spin_lock_init(&lg->lg_prealloc_lock);
2488 } 2502 }
2489 2503
2504 /* init file for buddy data */
2505 ret = ext4_mb_init_backend(sb);
2506 if (ret != 0) {
2507 goto out;
2508 }
2509
2490 if (sbi->s_proc) 2510 if (sbi->s_proc)
2491 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2511 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2492 &ext4_mb_seq_groups_fops, sb); 2512 &ext4_mb_seq_groups_fops, sb);
@@ -2544,32 +2564,32 @@ int ext4_mb_release(struct super_block *sb)
2544 EXT4_DESC_PER_BLOCK_BITS(sb); 2564 EXT4_DESC_PER_BLOCK_BITS(sb);
2545 for (i = 0; i < num_meta_group_infos; i++) 2565 for (i = 0; i < num_meta_group_infos; i++)
2546 kfree(sbi->s_group_info[i]); 2566 kfree(sbi->s_group_info[i]);
2547 kfree(sbi->s_group_info); 2567 ext4_kvfree(sbi->s_group_info);
2548 } 2568 }
2549 kfree(sbi->s_mb_offsets); 2569 kfree(sbi->s_mb_offsets);
2550 kfree(sbi->s_mb_maxs); 2570 kfree(sbi->s_mb_maxs);
2551 if (sbi->s_buddy_cache) 2571 if (sbi->s_buddy_cache)
2552 iput(sbi->s_buddy_cache); 2572 iput(sbi->s_buddy_cache);
2553 if (sbi->s_mb_stats) { 2573 if (sbi->s_mb_stats) {
2554 printk(KERN_INFO 2574 ext4_msg(sb, KERN_INFO,
2555 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", 2575 "mballoc: %u blocks %u reqs (%u success)",
2556 atomic_read(&sbi->s_bal_allocated), 2576 atomic_read(&sbi->s_bal_allocated),
2557 atomic_read(&sbi->s_bal_reqs), 2577 atomic_read(&sbi->s_bal_reqs),
2558 atomic_read(&sbi->s_bal_success)); 2578 atomic_read(&sbi->s_bal_success));
2559 printk(KERN_INFO 2579 ext4_msg(sb, KERN_INFO,
2560 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " 2580 "mballoc: %u extents scanned, %u goal hits, "
2561 "%u 2^N hits, %u breaks, %u lost\n", 2581 "%u 2^N hits, %u breaks, %u lost",
2562 atomic_read(&sbi->s_bal_ex_scanned), 2582 atomic_read(&sbi->s_bal_ex_scanned),
2563 atomic_read(&sbi->s_bal_goals), 2583 atomic_read(&sbi->s_bal_goals),
2564 atomic_read(&sbi->s_bal_2orders), 2584 atomic_read(&sbi->s_bal_2orders),
2565 atomic_read(&sbi->s_bal_breaks), 2585 atomic_read(&sbi->s_bal_breaks),
2566 atomic_read(&sbi->s_mb_lost_chunks)); 2586 atomic_read(&sbi->s_mb_lost_chunks));
2567 printk(KERN_INFO 2587 ext4_msg(sb, KERN_INFO,
2568 "EXT4-fs: mballoc: %lu generated and it took %Lu\n", 2588 "mballoc: %lu generated and it took %Lu",
2569 sbi->s_mb_buddies_generated++, 2589 sbi->s_mb_buddies_generated,
2570 sbi->s_mb_generation_time); 2590 sbi->s_mb_generation_time);
2571 printk(KERN_INFO 2591 ext4_msg(sb, KERN_INFO,
2572 "EXT4-fs: mballoc: %u preallocated, %u discarded\n", 2592 "mballoc: %u preallocated, %u discarded",
2573 atomic_read(&sbi->s_mb_preallocated), 2593 atomic_read(&sbi->s_mb_preallocated),
2574 atomic_read(&sbi->s_mb_discarded)); 2594 atomic_read(&sbi->s_mb_discarded));
2575 } 2595 }
@@ -2628,6 +2648,15 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2628 rb_erase(&entry->node, &(db->bb_free_root)); 2648 rb_erase(&entry->node, &(db->bb_free_root));
2629 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2649 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
2630 2650
2651 /*
2652 * Clear the trimmed flag for the group so that the next
2653 * ext4_trim_fs can trim it.
2654 * If the volume is mounted with -o discard, online discard
2655 * is supported and the free blocks will be trimmed online.
2656 */
2657 if (!test_opt(sb, DISCARD))
2658 EXT4_MB_GRP_CLEAR_TRIMMED(db);
2659
2631 if (!db->bb_free_root.rb_node) { 2660 if (!db->bb_free_root.rb_node) {
2632 /* No more items in the per group rb tree 2661 /* No more items in the per group rb tree
2633 * balance refcounts from ext4_mb_free_metadata() 2662 * balance refcounts from ext4_mb_free_metadata()
@@ -2771,8 +2800,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2771 * We leak some of the blocks here. 2800 * We leak some of the blocks here.
2772 */ 2801 */
2773 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2802 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2774 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2803 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2775 ac->ac_b_ex.fe_len); 2804 ac->ac_b_ex.fe_len);
2776 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2805 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2777 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2806 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2778 if (!err) 2807 if (!err)
@@ -2790,7 +2819,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2790 } 2819 }
2791 } 2820 }
2792#endif 2821#endif
2793 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); 2822 ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2823 ac->ac_b_ex.fe_len);
2794 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2824 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2795 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2825 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2796 ext4_free_blks_set(sb, gdp, 2826 ext4_free_blks_set(sb, gdp,
@@ -2830,8 +2860,9 @@ out_err:
2830 2860
2831/* 2861/*
2832 * here we normalize request for locality group 2862 * here we normalize request for locality group
2833 * Group request are normalized to s_strip size if we set the same via mount 2863 * Group requests are normalized to s_mb_group_prealloc, which goes to
2834 * option. If not we set it to s_mb_group_prealloc which can be configured via 2864 * s_strip if we set the same via mount option.
2865 * s_mb_group_prealloc can be configured via
2835 * /sys/fs/ext4/<partition>/mb_group_prealloc 2866 * /sys/fs/ext4/<partition>/mb_group_prealloc
2836 * 2867 *
2837 * XXX: should we try to preallocate more than the group has now? 2868 * XXX: should we try to preallocate more than the group has now?
@@ -2842,10 +2873,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2842 struct ext4_locality_group *lg = ac->ac_lg; 2873 struct ext4_locality_group *lg = ac->ac_lg;
2843 2874
2844 BUG_ON(lg == NULL); 2875 BUG_ON(lg == NULL);
2845 if (EXT4_SB(sb)->s_stripe) 2876 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2846 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
2847 else
2848 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2849 mb_debug(1, "#%u: goal %u blocks for locality group\n", 2877 mb_debug(1, "#%u: goal %u blocks for locality group\n",
2850 current->pid, ac->ac_g_ex.fe_len); 2878 current->pid, ac->ac_g_ex.fe_len);
2851} 2879}
@@ -3001,9 +3029,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3001 3029
3002 if (start + size <= ac->ac_o_ex.fe_logical && 3030 if (start + size <= ac->ac_o_ex.fe_logical &&
3003 start > ac->ac_o_ex.fe_logical) { 3031 start > ac->ac_o_ex.fe_logical) {
3004 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", 3032 ext4_msg(ac->ac_sb, KERN_ERR,
3005 (unsigned long) start, (unsigned long) size, 3033 "start %lu, size %lu, fe_logical %lu",
3006 (unsigned long) ac->ac_o_ex.fe_logical); 3034 (unsigned long) start, (unsigned long) size,
3035 (unsigned long) ac->ac_o_ex.fe_logical);
3007 } 3036 }
3008 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3037 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3009 start > ac->ac_o_ex.fe_logical); 3038 start > ac->ac_o_ex.fe_logical);
@@ -3262,7 +3291,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3262 3291
3263 while (n) { 3292 while (n) {
3264 entry = rb_entry(n, struct ext4_free_data, node); 3293 entry = rb_entry(n, struct ext4_free_data, node);
3265 mb_set_bits(bitmap, entry->start_blk, entry->count); 3294 ext4_set_bits(bitmap, entry->start_blk, entry->count);
3266 n = rb_next(n); 3295 n = rb_next(n);
3267 } 3296 }
3268 return; 3297 return;
@@ -3304,7 +3333,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3304 if (unlikely(len == 0)) 3333 if (unlikely(len == 0))
3305 continue; 3334 continue;
3306 BUG_ON(groupnr != group); 3335 BUG_ON(groupnr != group);
3307 mb_set_bits(bitmap, start, len); 3336 ext4_set_bits(bitmap, start, len);
3308 preallocated += len; 3337 preallocated += len;
3309 count++; 3338 count++;
3310 } 3339 }
@@ -3584,10 +3613,11 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3584 bit = next + 1; 3613 bit = next + 1;
3585 } 3614 }
3586 if (free != pa->pa_free) { 3615 if (free != pa->pa_free) {
3587 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", 3616 ext4_msg(e4b->bd_sb, KERN_CRIT,
3588 pa, (unsigned long) pa->pa_lstart, 3617 "pa %p: logic %lu, phys. %lu, len %lu",
3589 (unsigned long) pa->pa_pstart, 3618 pa, (unsigned long) pa->pa_lstart,
3590 (unsigned long) pa->pa_len); 3619 (unsigned long) pa->pa_pstart,
3620 (unsigned long) pa->pa_len);
3591 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 3621 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3592 free, pa->pa_free); 3622 free, pa->pa_free);
3593 /* 3623 /*
@@ -3775,7 +3805,8 @@ repeat:
3775 * use preallocation while we're discarding it */ 3805 * use preallocation while we're discarding it */
3776 spin_unlock(&pa->pa_lock); 3806 spin_unlock(&pa->pa_lock);
3777 spin_unlock(&ei->i_prealloc_lock); 3807 spin_unlock(&ei->i_prealloc_lock);
3778 printk(KERN_ERR "uh-oh! used pa while discarding\n"); 3808 ext4_msg(sb, KERN_ERR,
3809 "uh-oh! used pa while discarding");
3779 WARN_ON(1); 3810 WARN_ON(1);
3780 schedule_timeout_uninterruptible(HZ); 3811 schedule_timeout_uninterruptible(HZ);
3781 goto repeat; 3812 goto repeat;
@@ -3852,12 +3883,13 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3852 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3883 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3853 return; 3884 return;
3854 3885
3855 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3886 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:"
3856 " Allocation context details:\n"); 3887 " Allocation context details:");
3857 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3888 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d",
3858 ac->ac_status, ac->ac_flags); 3889 ac->ac_status, ac->ac_flags);
3859 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " 3890 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, "
3860 "best %lu/%lu/%lu@%lu cr %d\n", 3891 "goal %lu/%lu/%lu@%lu, "
3892 "best %lu/%lu/%lu@%lu cr %d",
3861 (unsigned long)ac->ac_o_ex.fe_group, 3893 (unsigned long)ac->ac_o_ex.fe_group,
3862 (unsigned long)ac->ac_o_ex.fe_start, 3894 (unsigned long)ac->ac_o_ex.fe_start,
3863 (unsigned long)ac->ac_o_ex.fe_len, 3895 (unsigned long)ac->ac_o_ex.fe_len,
@@ -3871,9 +3903,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3871 (unsigned long)ac->ac_b_ex.fe_len, 3903 (unsigned long)ac->ac_b_ex.fe_len,
3872 (unsigned long)ac->ac_b_ex.fe_logical, 3904 (unsigned long)ac->ac_b_ex.fe_logical,
3873 (int)ac->ac_criteria); 3905 (int)ac->ac_criteria);
3874 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 3906 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found",
3875 ac->ac_found); 3907 ac->ac_ex_scanned, ac->ac_found);
3876 printk(KERN_ERR "EXT4-fs: groups: \n"); 3908 ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: ");
3877 ngroups = ext4_get_groups_count(sb); 3909 ngroups = ext4_get_groups_count(sb);
3878 for (i = 0; i < ngroups; i++) { 3910 for (i = 0; i < ngroups; i++) {
3879 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3911 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4637,7 +4669,7 @@ do_more:
4637 } 4669 }
4638 ext4_mark_super_dirty(sb); 4670 ext4_mark_super_dirty(sb);
4639error_return: 4671error_return:
4640 if (freed) 4672 if (freed && !(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4641 dquot_free_block(inode, freed); 4673 dquot_free_block(inode, freed);
4642 brelse(bitmap_bh); 4674 brelse(bitmap_bh);
4643 ext4_std_error(sb, err); 4675 ext4_std_error(sb, err);
@@ -4645,7 +4677,7 @@ error_return:
4645} 4677}
4646 4678
4647/** 4679/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group 4680 * ext4_group_add_blocks() -- Add given blocks to an existing group
4649 * @handle: handle to this transaction 4681 * @handle: handle to this transaction
4650 * @sb: super block 4682 * @sb: super block
4651 * @block: start physical block to add to the block group 4683 * @block: start physical block to add to the block group
@@ -4653,7 +4685,7 @@ error_return:
4653 * 4685 *
4654 * This marks the blocks as free in the bitmap and buddy. 4686 * This marks the blocks as free in the bitmap and buddy.
4655 */ 4687 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 4688int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4657 ext4_fsblk_t block, unsigned long count) 4689 ext4_fsblk_t block, unsigned long count)
4658{ 4690{
4659 struct buffer_head *bitmap_bh = NULL; 4691 struct buffer_head *bitmap_bh = NULL;
@@ -4666,25 +4698,35 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4666 struct ext4_buddy e4b; 4698 struct ext4_buddy e4b;
4667 int err = 0, ret, blk_free_count; 4699 int err = 0, ret, blk_free_count;
4668 ext4_grpblk_t blocks_freed; 4700 ext4_grpblk_t blocks_freed;
4669 struct ext4_group_info *grp;
4670 4701
4671 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 4702 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672 4703
4704 if (count == 0)
4705 return 0;
4706
4673 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4707 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674 grp = ext4_get_group_info(sb, block_group);
4675 /* 4708 /*
4676 * Check to see if we are freeing blocks across a group 4709 * Check to see if we are freeing blocks across a group
4677 * boundary. 4710 * boundary.
4678 */ 4711 */
4679 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) 4712 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4713 ext4_warning(sb, "too many blocks added to group %u\n",
4714 block_group);
4715 err = -EINVAL;
4680 goto error_return; 4716 goto error_return;
4717 }
4681 4718
4682 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4719 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683 if (!bitmap_bh) 4720 if (!bitmap_bh) {
4721 err = -EIO;
4684 goto error_return; 4722 goto error_return;
4723 }
4724
4685 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 4725 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686 if (!desc) 4726 if (!desc) {
4727 err = -EIO;
4687 goto error_return; 4728 goto error_return;
4729 }
4688 4730
4689 if (in_range(ext4_block_bitmap(sb, desc), block, count) || 4731 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690 in_range(ext4_inode_bitmap(sb, desc), block, count) || 4732 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
@@ -4694,6 +4736,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4694 ext4_error(sb, "Adding blocks in system zones - " 4736 ext4_error(sb, "Adding blocks in system zones - "
4695 "Block = %llu, count = %lu", 4737 "Block = %llu, count = %lu",
4696 block, count); 4738 block, count);
4739 err = -EINVAL;
4697 goto error_return; 4740 goto error_return;
4698 } 4741 }
4699 4742
@@ -4762,7 +4805,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4762error_return: 4805error_return:
4763 brelse(bitmap_bh); 4806 brelse(bitmap_bh);
4764 ext4_std_error(sb, err); 4807 ext4_std_error(sb, err);
4765 return; 4808 return err;
4766} 4809}
4767 4810
4768/** 4811/**
@@ -4782,6 +4825,8 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4782{ 4825{
4783 struct ext4_free_extent ex; 4826 struct ext4_free_extent ex;
4784 4827
4828 trace_ext4_trim_extent(sb, group, start, count);
4829
4785 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4830 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4786 4831
4787 ex.fe_start = start; 4832 ex.fe_start = start;
@@ -4802,7 +4847,7 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
4802/** 4847/**
4803 * ext4_trim_all_free -- function to trim all free space in alloc. group 4848 * ext4_trim_all_free -- function to trim all free space in alloc. group
4804 * @sb: super block for file system 4849 * @sb: super block for file system
4805 * @e4b: ext4 buddy 4850 * @group: group to be trimmed
4806 * @start: first group block to examine 4851 * @start: first group block to examine
4807 * @max: last group block to examine 4852 * @max: last group block to examine
4808 * @minblocks: minimum extent block count 4853 * @minblocks: minimum extent block count
@@ -4823,10 +4868,12 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4823 ext4_grpblk_t minblocks) 4868 ext4_grpblk_t minblocks)
4824{ 4869{
4825 void *bitmap; 4870 void *bitmap;
4826 ext4_grpblk_t next, count = 0; 4871 ext4_grpblk_t next, count = 0, free_count = 0;
4827 struct ext4_buddy e4b; 4872 struct ext4_buddy e4b;
4828 int ret; 4873 int ret;
4829 4874
4875 trace_ext4_trim_all_free(sb, group, start, max);
4876
4830 ret = ext4_mb_load_buddy(sb, group, &e4b); 4877 ret = ext4_mb_load_buddy(sb, group, &e4b);
4831 if (ret) { 4878 if (ret) {
4832 ext4_error(sb, "Error in loading buddy " 4879 ext4_error(sb, "Error in loading buddy "
@@ -4836,6 +4883,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4836 bitmap = e4b.bd_bitmap; 4883 bitmap = e4b.bd_bitmap;
4837 4884
4838 ext4_lock_group(sb, group); 4885 ext4_lock_group(sb, group);
4886 if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
4887 minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
4888 goto out;
4889
4839 start = (e4b.bd_info->bb_first_free > start) ? 4890 start = (e4b.bd_info->bb_first_free > start) ?
4840 e4b.bd_info->bb_first_free : start; 4891 e4b.bd_info->bb_first_free : start;
4841 4892
@@ -4850,6 +4901,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4850 next - start, group, &e4b); 4901 next - start, group, &e4b);
4851 count += next - start; 4902 count += next - start;
4852 } 4903 }
4904 free_count += next - start;
4853 start = next + 1; 4905 start = next + 1;
4854 4906
4855 if (fatal_signal_pending(current)) { 4907 if (fatal_signal_pending(current)) {
@@ -4863,9 +4915,13 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4863 ext4_lock_group(sb, group); 4915 ext4_lock_group(sb, group);
4864 } 4916 }
4865 4917
4866 if ((e4b.bd_info->bb_free - count) < minblocks) 4918 if ((e4b.bd_info->bb_free - free_count) < minblocks)
4867 break; 4919 break;
4868 } 4920 }
4921
4922 if (!ret)
4923 EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
4924out:
4869 ext4_unlock_group(sb, group); 4925 ext4_unlock_group(sb, group);
4870 ext4_mb_unload_buddy(&e4b); 4926 ext4_mb_unload_buddy(&e4b);
4871 4927
@@ -4904,6 +4960,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4904 4960
4905 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4961 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4906 return -EINVAL; 4962 return -EINVAL;
4963 if (start + len <= first_data_blk)
4964 goto out;
4907 if (start < first_data_blk) { 4965 if (start < first_data_blk) {
4908 len -= first_data_blk - start; 4966 len -= first_data_blk - start;
4909 start = first_data_blk; 4967 start = first_data_blk;
@@ -4952,5 +5010,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4952 } 5010 }
4953 range->len = trimmed * sb->s_blocksize; 5011 range->len = trimmed * sb->s_blocksize;
4954 5012
5013 if (!ret)
5014 atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5015
5016out:
4955 return ret; 5017 return ret;
4956} 5018}
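The trimmed-flag plumbing added through this file lets a repeated FITRIM skip groups whose free space was already discarded at an equal or larger minlen and has not been touched by a free since. Reduced to its skeleton; the flag, group struct, and helper here are stand-ins, not the ext4 ones:

#include <stdio.h>

struct toy_group { int was_trimmed; };
static unsigned last_trim_minblks;    /* models s_last_trim_minblks */

static unsigned trim_group(struct toy_group *g, unsigned minblocks)
{
        /* nothing freed since the last trim at >= this granularity:
         * skip the expensive bitmap walk entirely */
        if (g->was_trimmed && minblocks >= last_trim_minblks)
                return 0;
        /* ... walk free extents >= minblocks and discard them ... */
        g->was_trimmed = 1;           /* cleared again when blocks are freed */
        return 1;
}

int main(void)
{
        struct toy_group g = { 0 };
        last_trim_minblks = 16;
        printf("first pass trimmed:  %u\n", trim_group(&g, 16));  /* 1 */
        printf("second pass trimmed: %u\n", trim_group(&g, 32));  /* 0 */
        return 0;
}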
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 20b5e7bfebd1..9d4a636b546c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -187,7 +187,6 @@ struct ext4_allocation_context {
187 __u16 ac_flags; /* allocation hints */ 187 __u16 ac_flags; /* allocation hints */
188 __u8 ac_status; 188 __u8 ac_status;
189 __u8 ac_criteria; 189 __u8 ac_criteria;
190 __u8 ac_repeats;
191 __u8 ac_2order; /* if request is to allocate 2^N blocks and 190 __u8 ac_2order; /* if request is to allocate 2^N blocks and
192 * N > 0, the field stores N, otherwise 0 */ 191 * N > 0, the field stores N, otherwise 0 */
193 __u8 ac_op; /* operation, for history only */ 192 __u8 ac_op; /* operation, for history only */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8c9babac43dc..565a154e22d4 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -289,7 +289,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
289 while (len--) printk("%c", *name++); 289 while (len--) printk("%c", *name++);
290 ext4fs_dirhash(de->name, de->name_len, &h); 290 ext4fs_dirhash(de->name, de->name_len, &h);
291 printk(":%x.%u ", h.hash, 291 printk(":%x.%u ", h.hash,
292 ((char *) de - base)); 292 (unsigned) ((char *) de - base));
293 } 293 }
294 space += EXT4_DIR_REC_LEN(de->name_len); 294 space += EXT4_DIR_REC_LEN(de->name_len);
295 names++; 295 names++;
@@ -1013,7 +1013,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1013 1013
1014 *err = -ENOENT; 1014 *err = -ENOENT;
1015errout: 1015errout:
1016 dxtrace(printk(KERN_DEBUG "%s not found\n", name)); 1016 dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name));
1017 dx_release (frames); 1017 dx_release (frames);
1018 return NULL; 1018 return NULL;
1019} 1019}
@@ -1985,18 +1985,11 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
1985 if (!list_empty(&EXT4_I(inode)->i_orphan)) 1985 if (!list_empty(&EXT4_I(inode)->i_orphan))
1986 goto out_unlock; 1986 goto out_unlock;
1987 1987
1988 /* Orphan handling is only valid for files with data blocks 1988 /*
1989 * being truncated, or files being unlinked. */ 1989 * Orphan handling is only valid for files with data blocks
1990 1990 * being truncated, or files being unlinked. Note that we either
1991 /* @@@ FIXME: Observation from aviro: 1991 * hold i_mutex, or the inode cannot be referenced from outside,
1992 * I think I can trigger J_ASSERT in ext4_orphan_add(). We block 1992 * so i_nlink should not be bumped due to a race
1993 * here (on s_orphan_lock), so race with ext4_link() which might bump
1994 * ->i_nlink. For, say it, character device. Not a regular file,
1995 * not a directory, not a symlink and ->i_nlink > 0.
1996 *
1997 * tytso, 4/25/2009: I'm not sure how that could happen;
1998 * shouldn't the fs core protect us from these sort of
1999 * unlink()/link() races?
2000 */ 1993 */
2001 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1994 J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2002 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); 1995 S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7bb8f76d470a..430c401d0895 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -285,11 +285,7 @@ static int io_submit_init(struct ext4_io_submit *io,
285 io_end = ext4_init_io_end(inode, GFP_NOFS); 285 io_end = ext4_init_io_end(inode, GFP_NOFS);
286 if (!io_end) 286 if (!io_end)
287 return -ENOMEM; 287 return -ENOMEM;
288 do { 288 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
289 bio = bio_alloc(GFP_NOIO, nvecs);
290 nvecs >>= 1;
291 } while (bio == NULL);
292
293 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 289 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
294 bio->bi_bdev = bh->b_bdev; 290 bio->bi_bdev = bh->b_bdev;
295 bio->bi_private = io->io_end = io_end; 291 bio->bi_private = io->io_end = io_end;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 80bbc9c60c24..707d3f16f7ce 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -16,6 +16,35 @@
16 16
17#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
18 18
19int ext4_resize_begin(struct super_block *sb)
20{
21 int ret = 0;
22
23 if (!capable(CAP_SYS_RESOURCE))
24 return -EPERM;
25
26 /*
27 * We are not allowed to do online-resizing on a filesystem mounted
28 * with error, because it can destroy the filesystem easily.
29 */
30 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
31 ext4_warning(sb, "There are errors in the filesystem, "
32 "so online resizing is not allowed\n");
33 return -EPERM;
34 }
35
36 if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
37 ret = -EBUSY;
38
39 return ret;
40}
41
42void ext4_resize_end(struct super_block *sb)
43{
44 clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
45 smp_mb__after_clear_bit();
46}
47
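ext4_resize_begin()/ext4_resize_end() above replace the old s_resize_lock mutex with a single flag bit: test_and_set_bit_lock() admits exactly one resizer (anyone else gets -EBUSY), and clear_bit_unlock() plus the barrier releases it. The same try-lock-bit idiom, modelled here with C11 atomics rather than the kernel bitops:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag resizing = ATOMIC_FLAG_INIT;

static int resize_begin(void)
{
        /* like test_and_set_bit_lock(): fail if the bit was already set */
        if (atomic_flag_test_and_set_explicit(&resizing, memory_order_acquire))
                return -1;      /* -EBUSY */
        return 0;
}

static void resize_end(void)
{
        /* like clear_bit_unlock() + smp_mb__after_clear_bit(): release */
        atomic_flag_clear_explicit(&resizing, memory_order_release);
}

int main(void)
{
        printf("first begin:  %d\n", resize_begin());   /* 0 */
        printf("second begin: %d\n", resize_begin());   /* -1, busy */
        resize_end();
        printf("after end:    %d\n", resize_begin());   /* 0 */
        return 0;
}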
19#define outside(b, first, last) ((b) < (first) || (b) >= (last)) 48#define outside(b, first, last) ((b) < (first) || (b) >= (last))
20#define inside(b, first, last) ((b) >= (first) && (b) < (last)) 49#define inside(b, first, last) ((b) >= (first) && (b) < (last))
21 50
@@ -118,10 +147,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
118 brelse(bh); 147 brelse(bh);
119 bh = ERR_PTR(err); 148 bh = ERR_PTR(err);
120 } else { 149 } else {
121 lock_buffer(bh);
122 memset(bh->b_data, 0, sb->s_blocksize); 150 memset(bh->b_data, 0, sb->s_blocksize);
123 set_buffer_uptodate(bh); 151 set_buffer_uptodate(bh);
124 unlock_buffer(bh);
125 } 152 }
126 153
127 return bh; 154 return bh;
@@ -132,8 +159,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
132 * If that fails, restart the transaction & regain write access for the 159 * If that fails, restart the transaction & regain write access for the
133 * buffer head which is used for block_bitmap modifications. 160 * buffer head which is used for block_bitmap modifications.
134 */ 161 */
135static int extend_or_restart_transaction(handle_t *handle, int thresh, 162static int extend_or_restart_transaction(handle_t *handle, int thresh)
136 struct buffer_head *bh)
137{ 163{
138 int err; 164 int err;
139 165
@@ -144,9 +170,8 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
144 if (err < 0) 170 if (err < 0)
145 return err; 171 return err;
146 if (err) { 172 if (err) {
147 if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) 173 err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA);
148 return err; 174 if (err)
149 if ((err = ext4_journal_get_write_access(handle, bh)))
150 return err; 175 return err;
151 } 176 }
152 177
@@ -181,21 +206,7 @@ static int setup_new_group_blocks(struct super_block *sb,
181 if (IS_ERR(handle)) 206 if (IS_ERR(handle))
182 return PTR_ERR(handle); 207 return PTR_ERR(handle);
183 208
184 mutex_lock(&sbi->s_resize_lock); 209 BUG_ON(input->group != sbi->s_groups_count);
185 if (input->group != sbi->s_groups_count) {
186 err = -EBUSY;
187 goto exit_journal;
188 }
189
190 if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) {
191 err = PTR_ERR(bh);
192 goto exit_journal;
193 }
194
195 if (ext4_bg_has_super(sb, input->group)) {
196 ext4_debug("mark backup superblock %#04llx (+0)\n", start);
197 ext4_set_bit(0, bh->b_data);
198 }
199 210
200 /* Copy all of the GDT blocks into the backup in this group */ 211 /* Copy all of the GDT blocks into the backup in this group */
201 for (i = 0, bit = 1, block = start + 1; 212 for (i = 0, bit = 1, block = start + 1;
@@ -203,29 +214,26 @@ static int setup_new_group_blocks(struct super_block *sb,
203 struct buffer_head *gdb; 214 struct buffer_head *gdb;
204 215
205 ext4_debug("update backup group %#04llx (+%d)\n", block, bit); 216 ext4_debug("update backup group %#04llx (+%d)\n", block, bit);
206 217 err = extend_or_restart_transaction(handle, 1);
207 if ((err = extend_or_restart_transaction(handle, 1, bh))) 218 if (err)
208 goto exit_bh; 219 goto exit_journal;
209 220
210 gdb = sb_getblk(sb, block); 221 gdb = sb_getblk(sb, block);
211 if (!gdb) { 222 if (!gdb) {
212 err = -EIO; 223 err = -EIO;
213 goto exit_bh; 224 goto exit_journal;
214 } 225 }
215 if ((err = ext4_journal_get_write_access(handle, gdb))) { 226 if ((err = ext4_journal_get_write_access(handle, gdb))) {
216 brelse(gdb); 227 brelse(gdb);
217 goto exit_bh; 228 goto exit_journal;
218 } 229 }
219 lock_buffer(gdb);
220 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 230 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size);
221 set_buffer_uptodate(gdb); 231 set_buffer_uptodate(gdb);
222 unlock_buffer(gdb);
223 err = ext4_handle_dirty_metadata(handle, NULL, gdb); 232 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
224 if (unlikely(err)) { 233 if (unlikely(err)) {
225 brelse(gdb); 234 brelse(gdb);
226 goto exit_bh; 235 goto exit_journal;
227 } 236 }
228 ext4_set_bit(bit, bh->b_data);
229 brelse(gdb); 237 brelse(gdb);
230 } 238 }
231 239
@@ -235,9 +243,22 @@ static int setup_new_group_blocks(struct super_block *sb,
235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, 243 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
236 GFP_NOFS); 244 GFP_NOFS);
237 if (err) 245 if (err)
238 goto exit_bh; 246 goto exit_journal;
239 for (i = 0, bit = gdblocks + 1; i < reserved_gdb; i++, bit++) 247
240 ext4_set_bit(bit, bh->b_data); 248 err = extend_or_restart_transaction(handle, 2);
249 if (err)
250 goto exit_journal;
251
252 bh = bclean(handle, sb, input->block_bitmap);
253 if (IS_ERR(bh)) {
254 err = PTR_ERR(bh);
255 goto exit_journal;
256 }
257
258 if (ext4_bg_has_super(sb, input->group)) {
259 ext4_debug("mark backup group tables %#04llx (+0)\n", start);
260 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1);
261 }
241 262
242 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 263 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap,
243 input->block_bitmap - start); 264 input->block_bitmap - start);
@@ -253,12 +274,9 @@ static int setup_new_group_blocks(struct super_block *sb,
253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 274 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
254 if (err) 275 if (err)
255 goto exit_bh; 276 goto exit_bh;
256 for (i = 0, bit = input->inode_table - start; 277 ext4_set_bits(bh->b_data, input->inode_table - start,
257 i < sbi->s_itb_per_group; i++, bit++) 278 sbi->s_itb_per_group);
258 ext4_set_bit(bit, bh->b_data);
259 279
260 if ((err = extend_or_restart_transaction(handle, 2, bh)))
261 goto exit_bh;
262 280
263 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 281 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8,
264 bh->b_data); 282 bh->b_data);
@@ -285,7 +303,6 @@ exit_bh:
285 brelse(bh); 303 brelse(bh);
286 304
287exit_journal: 305exit_journal:
288 mutex_unlock(&sbi->s_resize_lock);
289 if ((err2 = ext4_journal_stop(handle)) && !err) 306 if ((err2 = ext4_journal_stop(handle)) && !err)
290 err = err2; 307 err = err2;
291 308
@@ -377,15 +394,15 @@ static int verify_reserved_gdb(struct super_block *sb,
377 * fail once we start modifying the data on disk, because JBD has no rollback. 394 * fail once we start modifying the data on disk, because JBD has no rollback.
378 */ 395 */
379static int add_new_gdb(handle_t *handle, struct inode *inode, 396static int add_new_gdb(handle_t *handle, struct inode *inode,
380 struct ext4_new_group_data *input, 397 ext4_group_t group)
381 struct buffer_head **primary)
382{ 398{
383 struct super_block *sb = inode->i_sb; 399 struct super_block *sb = inode->i_sb;
384 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 400 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
385 unsigned long gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 401 unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
386 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; 402 ext4_fsblk_t gdblock = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num;
387 struct buffer_head **o_group_desc, **n_group_desc; 403 struct buffer_head **o_group_desc, **n_group_desc;
388 struct buffer_head *dind; 404 struct buffer_head *dind;
405 struct buffer_head *gdb_bh;
389 int gdbackups; 406 int gdbackups;
390 struct ext4_iloc iloc; 407 struct ext4_iloc iloc;
391 __le32 *data; 408 __le32 *data;
@@ -408,11 +425,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
408 return -EPERM; 425 return -EPERM;
409 } 426 }
410 427
411 *primary = sb_bread(sb, gdblock); 428 gdb_bh = sb_bread(sb, gdblock);
412 if (!*primary) 429 if (!gdb_bh)
413 return -EIO; 430 return -EIO;
414 431
415 if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { 432 gdbackups = verify_reserved_gdb(sb, gdb_bh);
433 if (gdbackups < 0) {
416 err = gdbackups; 434 err = gdbackups;
417 goto exit_bh; 435 goto exit_bh;
418 } 436 }
@@ -427,7 +445,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
427 data = (__le32 *)dind->b_data; 445 data = (__le32 *)dind->b_data;
428 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { 446 if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) {
429 ext4_warning(sb, "new group %u GDT block %llu not reserved", 447 ext4_warning(sb, "new group %u GDT block %llu not reserved",
430 input->group, gdblock); 448 group, gdblock);
431 err = -EINVAL; 449 err = -EINVAL;
432 goto exit_dind; 450 goto exit_dind;
433 } 451 }
@@ -436,7 +454,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
436 if (unlikely(err)) 454 if (unlikely(err))
437 goto exit_dind; 455 goto exit_dind;
438 456
439 err = ext4_journal_get_write_access(handle, *primary); 457 err = ext4_journal_get_write_access(handle, gdb_bh);
440 if (unlikely(err)) 458 if (unlikely(err))
441 goto exit_sbh; 459 goto exit_sbh;
442 460
@@ -449,12 +467,13 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
449 if (unlikely(err)) 467 if (unlikely(err))
450 goto exit_dindj; 468 goto exit_dindj;
451 469
452 n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), 470 n_group_desc = ext4_kvmalloc((gdb_num + 1) *
453 GFP_NOFS); 471 sizeof(struct buffer_head *),
472 GFP_NOFS);
454 if (!n_group_desc) { 473 if (!n_group_desc) {
455 err = -ENOMEM; 474 err = -ENOMEM;
456 ext4_warning(sb, 475 ext4_warning(sb, "not enough memory for %lu groups",
457 "not enough memory for %lu groups", gdb_num + 1); 476 gdb_num + 1);
458 goto exit_inode; 477 goto exit_inode;
459 } 478 }
460 479
@@ -475,8 +494,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
475 } 494 }
476 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; 495 inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9;
477 ext4_mark_iloc_dirty(handle, inode, &iloc); 496 ext4_mark_iloc_dirty(handle, inode, &iloc);
478 memset((*primary)->b_data, 0, sb->s_blocksize); 497 memset(gdb_bh->b_data, 0, sb->s_blocksize);
479 err = ext4_handle_dirty_metadata(handle, NULL, *primary); 498 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
480 if (unlikely(err)) { 499 if (unlikely(err)) {
481 ext4_std_error(sb, err); 500 ext4_std_error(sb, err);
482 goto exit_inode; 501 goto exit_inode;
@@ -486,10 +505,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
486 o_group_desc = EXT4_SB(sb)->s_group_desc; 505 o_group_desc = EXT4_SB(sb)->s_group_desc;
487 memcpy(n_group_desc, o_group_desc, 506 memcpy(n_group_desc, o_group_desc,
488 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); 507 EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
489 n_group_desc[gdb_num] = *primary; 508 n_group_desc[gdb_num] = gdb_bh;
490 EXT4_SB(sb)->s_group_desc = n_group_desc; 509 EXT4_SB(sb)->s_group_desc = n_group_desc;
491 EXT4_SB(sb)->s_gdb_count++; 510 EXT4_SB(sb)->s_gdb_count++;
492 kfree(o_group_desc); 511 ext4_kvfree(o_group_desc);
493 512
494 le16_add_cpu(&es->s_reserved_gdt_blocks, -1); 513 le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
495 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); 514 err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
@@ -499,6 +518,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
499 return err; 518 return err;
500 519
501exit_inode: 520exit_inode:
521 ext4_kvfree(n_group_desc);
502 /* ext4_handle_release_buffer(handle, iloc.bh); */ 522 /* ext4_handle_release_buffer(handle, iloc.bh); */
503 brelse(iloc.bh); 523 brelse(iloc.bh);
504exit_dindj: 524exit_dindj:
@@ -508,7 +528,7 @@ exit_sbh:
508exit_dind: 528exit_dind:
509 brelse(dind); 529 brelse(dind);
510exit_bh: 530exit_bh:
511 brelse(*primary); 531 brelse(gdb_bh);
512 532
513 ext4_debug("leaving with error %d\n", err); 533 ext4_debug("leaving with error %d\n", err);
514 return err; 534 return err;
@@ -528,7 +548,7 @@ exit_bh:
528 * backup GDT blocks are stored in their reserved primary GDT block. 548 * backup GDT blocks are stored in their reserved primary GDT block.
529 */ 549 */
530static int reserve_backup_gdb(handle_t *handle, struct inode *inode, 550static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
531 struct ext4_new_group_data *input) 551 ext4_group_t group)
532{ 552{
533 struct super_block *sb = inode->i_sb; 553 struct super_block *sb = inode->i_sb;
534 int reserved_gdb = le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks); 554 int reserved_gdb = le16_to_cpu(EXT4_SB(sb)->s_es->s_reserved_gdt_blocks);
@@ -599,7 +619,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
599 * Finally we can add each of the reserved backup GDT blocks from 619 * Finally we can add each of the reserved backup GDT blocks from
600 * the new group to its reserved primary GDT block. 620 * the new group to its reserved primary GDT block.
601 */ 621 */
602 blk = input->group * EXT4_BLOCKS_PER_GROUP(sb); 622 blk = group * EXT4_BLOCKS_PER_GROUP(sb);
603 for (i = 0; i < reserved_gdb; i++) { 623 for (i = 0; i < reserved_gdb; i++) {
604 int err2; 624 int err2;
605 data = (__le32 *)primary[i]->b_data; 625 data = (__le32 *)primary[i]->b_data;
@@ -799,13 +819,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
799 goto exit_put; 819 goto exit_put;
800 } 820 }
801 821
802 mutex_lock(&sbi->s_resize_lock);
803 if (input->group != sbi->s_groups_count) {
804 ext4_warning(sb, "multiple resizers run on filesystem!");
805 err = -EBUSY;
806 goto exit_journal;
807 }
808
809 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) 822 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
810 goto exit_journal; 823 goto exit_journal;
811 824
@@ -820,16 +833,25 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
820 if ((err = ext4_journal_get_write_access(handle, primary))) 833 if ((err = ext4_journal_get_write_access(handle, primary)))
821 goto exit_journal; 834 goto exit_journal;
822 835
823 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group) && 836 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
824 (err = reserve_backup_gdb(handle, inode, input))) 837 err = reserve_backup_gdb(handle, inode, input->group);
838 if (err)
839 goto exit_journal;
840 }
841 } else {
842 /*
843 * Note that we can access new group descriptor block safely
844 * only if add_new_gdb() succeeds.
845 */
846 err = add_new_gdb(handle, inode, input->group);
847 if (err)
825 goto exit_journal; 848 goto exit_journal;
826 } else if ((err = add_new_gdb(handle, inode, input, &primary))) 849 primary = sbi->s_group_desc[gdb_num];
827 goto exit_journal; 850 }
828 851
829 /* 852 /*
830 * OK, now we've set up the new group. Time to make it active. 853 * OK, now we've set up the new group. Time to make it active.
831 * 854 *
832 * We do not lock all allocations via s_resize_lock
833 * so we have to be safe wrt. concurrent accesses to the group 855 * so we have to be safe wrt. concurrent accesses to the group
834 * data. So we need to be careful to set all of the relevant 856 * data. So we need to be careful to set all of the relevant
835 * group descriptor data etc. *before* we enable the group. 857 * group descriptor data etc. *before* we enable the group.
@@ -886,13 +908,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
886 * 908 *
887 * The precise rules we use are: 909 * The precise rules we use are:
888 * 910 *
889 * * Writers of s_groups_count *must* hold s_resize_lock
890 * AND
891 * * Writers must perform a smp_wmb() after updating all dependent 911 * * Writers must perform a smp_wmb() after updating all dependent
892 * data and before modifying the groups count 912 * data and before modifying the groups count
893 * 913 *
894 * * Readers must hold s_resize_lock over the access
895 * OR
896 * * Readers must perform an smp_rmb() after reading the groups count 914 * * Readers must perform an smp_rmb() after reading the groups count
897 * and before reading any dependent data. 915 * and before reading any dependent data.
898 * 916 *
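
The pairing above is the classic publish/subscribe barrier pattern: the writer must make every piece of group metadata globally visible before it bumps s_groups_count, and a reader must order its load of the count before any load of the data the count guards. A minimal sketch of the pairing (setup_group() is a hypothetical placeholder for the descriptor and bitmap setup done in this function):

	/* Writer side: publish the data, then the count. */
	setup_group(sb, group);			/* hypothetical: fill descriptor, bitmaps */
	smp_wmb();				/* data must be visible before the count */
	EXT4_SB(sb)->s_groups_count = group + 1;

	/* Reader side: read the count, then the data. */
	ngroups = EXT4_SB(sb)->s_groups_count;
	smp_rmb();				/* count must be read before the data */
	desc = ext4_get_group_desc(sb, ngroups - 1, NULL);
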
@@ -937,10 +955,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
937 ext4_handle_dirty_super(handle, sb); 955 ext4_handle_dirty_super(handle, sb);
938 956
939exit_journal: 957exit_journal:
940 mutex_unlock(&sbi->s_resize_lock);
941 if ((err2 = ext4_journal_stop(handle)) && !err) 958 if ((err2 = ext4_journal_stop(handle)) && !err)
942 err = err2; 959 err = err2;
943 if (!err) { 960 if (!err && primary) {
944 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 961 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
945 sizeof(struct ext4_super_block)); 962 sizeof(struct ext4_super_block));
946 update_backups(sb, primary->b_blocknr, primary->b_data, 963 update_backups(sb, primary->b_blocknr, primary->b_data,
@@ -969,16 +986,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
969 ext4_grpblk_t add; 986 ext4_grpblk_t add;
970 struct buffer_head *bh; 987 struct buffer_head *bh;
971 handle_t *handle; 988 handle_t *handle;
972 int err; 989 int err, err2;
973 ext4_group_t group; 990 ext4_group_t group;
974 991
975 /* We don't need to worry about locking wrt other resizers just
976 * yet: we're going to revalidate es->s_blocks_count after
977 * taking the s_resize_lock below. */
978 o_blocks_count = ext4_blocks_count(es); 992 o_blocks_count = ext4_blocks_count(es);
979 993
980 if (test_opt(sb, DEBUG)) 994 if (test_opt(sb, DEBUG))
981 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", 995 printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
982 o_blocks_count, n_blocks_count); 996 o_blocks_count, n_blocks_count);
983 997
984 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 998 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
@@ -995,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
995 1009
996 if (n_blocks_count < o_blocks_count) { 1010 if (n_blocks_count < o_blocks_count) {
997 ext4_warning(sb, "can't shrink FS - resize aborted"); 1011 ext4_warning(sb, "can't shrink FS - resize aborted");
998 return -EBUSY; 1012 return -EINVAL;
999 } 1013 }
1000 1014
1001 /* Handle the remaining blocks in the last group only. */ 1015 /* Handle the remaining blocks in the last group only. */
@@ -1038,32 +1052,25 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1038 goto exit_put; 1052 goto exit_put;
1039 } 1053 }
1040 1054
1041 mutex_lock(&EXT4_SB(sb)->s_resize_lock);
1042 if (o_blocks_count != ext4_blocks_count(es)) {
1043 ext4_warning(sb, "multiple resizers run on filesystem!");
1044 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1045 ext4_journal_stop(handle);
1046 err = -EBUSY;
1047 goto exit_put;
1048 }
1049
1050 if ((err = ext4_journal_get_write_access(handle, 1055 if ((err = ext4_journal_get_write_access(handle,
1051 EXT4_SB(sb)->s_sbh))) { 1056 EXT4_SB(sb)->s_sbh))) {
1052 ext4_warning(sb, "error %d on journal write access", err); 1057 ext4_warning(sb, "error %d on journal write access", err);
1053 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1054 ext4_journal_stop(handle); 1058 ext4_journal_stop(handle);
1055 goto exit_put; 1059 goto exit_put;
1056 } 1060 }
1057 ext4_blocks_count_set(es, o_blocks_count + add); 1061 ext4_blocks_count_set(es, o_blocks_count + add);
1058 mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
1059 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, 1062 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1060 o_blocks_count + add); 1063 o_blocks_count + add);
1061 /* We add the blocks to the bitmap and set the group need init bit */ 1064 /* We add the blocks to the bitmap and set the group need init bit */
1062 ext4_add_groupblocks(handle, sb, o_blocks_count, add); 1065 err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
1063 ext4_handle_dirty_super(handle, sb); 1066 ext4_handle_dirty_super(handle, sb);
1064 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, 1067 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1065 o_blocks_count + add); 1068 o_blocks_count + add);
1066 if ((err = ext4_journal_stop(handle))) 1069 err2 = ext4_journal_stop(handle);
1070 if (!err && err2)
1071 err = err2;
1072
1073 if (err)
1067 goto exit_put; 1074 goto exit_put;
1068 1075
1069 if (test_opt(sb, DEBUG)) 1076 if (test_opt(sb, DEBUG))
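
The err/err2 dance introduced when stopping the handle is the usual first-error-wins idiom: ext4_journal_stop() must run unconditionally to release the handle, but its return value may only be reported if nothing failed earlier. In sketch form (do_extend_work() is a hypothetical stand-in for the resize steps above):

	err = do_extend_work(handle);		/* hypothetical: the actual resize steps */
	err2 = ext4_journal_stop(handle);	/* must always run to release the handle */
	if (!err && err2)
		err = err2;			/* surface the stop failure only if it is the first error */
	if (err)
		goto exit_put;
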
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9ea71aa864b3..4687fea0c00f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -110,6 +110,35 @@ static struct file_system_type ext3_fs_type = {
110#define IS_EXT3_SB(sb) (0) 110#define IS_EXT3_SB(sb) (0)
111#endif 111#endif
112 112
113void *ext4_kvmalloc(size_t size, gfp_t flags)
114{
115 void *ret;
116
117 ret = kmalloc(size, flags);
118 if (!ret)
119 ret = __vmalloc(size, flags, PAGE_KERNEL);
120 return ret;
121}
122
123void *ext4_kvzalloc(size_t size, gfp_t flags)
124{
125 void *ret;
126
127 ret = kzalloc(size, flags);
128 if (!ret)
129 ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
130 return ret;
131}
132
133void ext4_kvfree(void *ptr)
134{
135 if (is_vmalloc_addr(ptr))
136 vfree(ptr);
137 else
138 kfree(ptr);
139
140}
141
113ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, 142ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
114 struct ext4_group_desc *bg) 143 struct ext4_group_desc *bg)
115{ 144{
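
ext4_kvmalloc()/ext4_kvzalloc() implement the standard try-slab-then-vmalloc fallback for allocations that are usually small but may exceed what kmalloc() can satisfy under memory fragmentation (here: the array of group-descriptor buffer heads, which grows with online resize). The matching ext4_kvfree() dispatches on is_vmalloc_addr(), so callers never need to remember which allocator succeeded. A hedged usage sketch:

	struct buffer_head **gd;

	gd = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL);
	if (!gd)
		return -ENOMEM;
	/* ... populate and use gd ... */
	ext4_kvfree(gd);	/* correct for both kmalloc()ed and vmalloc()ed results */
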
@@ -269,6 +298,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
269 journal_t *journal; 298 journal_t *journal;
270 handle_t *handle; 299 handle_t *handle;
271 300
301 trace_ext4_journal_start(sb, nblocks, _RET_IP_);
272 if (sb->s_flags & MS_RDONLY) 302 if (sb->s_flags & MS_RDONLY)
273 return ERR_PTR(-EROFS); 303 return ERR_PTR(-EROFS);
274 304
@@ -789,11 +819,8 @@ static void ext4_put_super(struct super_block *sb)
789 819
790 for (i = 0; i < sbi->s_gdb_count; i++) 820 for (i = 0; i < sbi->s_gdb_count; i++)
791 brelse(sbi->s_group_desc[i]); 821 brelse(sbi->s_group_desc[i]);
792 kfree(sbi->s_group_desc); 822 ext4_kvfree(sbi->s_group_desc);
793 if (is_vmalloc_addr(sbi->s_flex_groups)) 823 ext4_kvfree(sbi->s_flex_groups);
794 vfree(sbi->s_flex_groups);
795 else
796 kfree(sbi->s_flex_groups);
797 percpu_counter_destroy(&sbi->s_freeblocks_counter); 824 percpu_counter_destroy(&sbi->s_freeblocks_counter);
798 percpu_counter_destroy(&sbi->s_freeinodes_counter); 825 percpu_counter_destroy(&sbi->s_freeinodes_counter);
799 percpu_counter_destroy(&sbi->s_dirs_counter); 826 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1976,15 +2003,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
1976 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 2003 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
1977 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; 2004 EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
1978 size = flex_group_count * sizeof(struct flex_groups); 2005 size = flex_group_count * sizeof(struct flex_groups);
1979 sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); 2006 sbi->s_flex_groups = ext4_kvzalloc(size, GFP_KERNEL);
1980 if (sbi->s_flex_groups == NULL) { 2007 if (sbi->s_flex_groups == NULL) {
1981 sbi->s_flex_groups = vzalloc(size); 2008 ext4_msg(sb, KERN_ERR, "not enough memory for %u flex groups",
1982 if (sbi->s_flex_groups == NULL) { 2009 flex_group_count);
1983 ext4_msg(sb, KERN_ERR, 2010 goto failed;
1984 "not enough memory for %u flex groups",
1985 flex_group_count);
1986 goto failed;
1987 }
1988 } 2011 }
1989 2012
1990 for (i = 0; i < sbi->s_groups_count; i++) { 2013 for (i = 0; i < sbi->s_groups_count; i++) {
@@ -2383,17 +2406,25 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2383 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride); 2406 unsigned long stride = le16_to_cpu(sbi->s_es->s_raid_stride);
2384 unsigned long stripe_width = 2407 unsigned long stripe_width =
2385 le32_to_cpu(sbi->s_es->s_raid_stripe_width); 2408 le32_to_cpu(sbi->s_es->s_raid_stripe_width);
2409 int ret;
2386 2410
2387 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group) 2411 if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
2388 return sbi->s_stripe; 2412 ret = sbi->s_stripe;
2389 2413 else if (stripe_width <= sbi->s_blocks_per_group)
2390 if (stripe_width <= sbi->s_blocks_per_group) 2414 ret = stripe_width;
2391 return stripe_width; 2415 else if (stride <= sbi->s_blocks_per_group)
2416 ret = stride;
2417 else
2418 ret = 0;
2392 2419
2393 if (stride <= sbi->s_blocks_per_group) 2420 /*
2394 return stride; 2421 * If the stripe width is 1, this makes no sense and
2422 * we set it to 0 to turn off stripe handling code.
2423 */
2424 if (ret <= 1)
2425 ret = 0;
2395 2426
2396 return 0; 2427 return ret;
2397} 2428}
2398 2429
2399/* sysfs support */ 2430/* sysfs support */
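
For example, with s_stripe unset, stripe_width = 64 and stride = 16 on a filesystem with 32768 blocks per group, the fall-through chain now yields ret = 64; if both RAID fields were 0, or stripe_width were the degenerate value 1, the final ret <= 1 check forces the result to 0 and the stripe-handling code stays disabled.
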
@@ -3408,8 +3439,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3408 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); 3439 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
3409 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 3440 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
3410 EXT4_DESC_PER_BLOCK(sb); 3441 EXT4_DESC_PER_BLOCK(sb);
3411 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 3442 sbi->s_group_desc = ext4_kvmalloc(db_count *
3412 GFP_KERNEL); 3443 sizeof(struct buffer_head *),
3444 GFP_KERNEL);
3413 if (sbi->s_group_desc == NULL) { 3445 if (sbi->s_group_desc == NULL) {
3414 ext4_msg(sb, KERN_ERR, "not enough memory"); 3446 ext4_msg(sb, KERN_ERR, "not enough memory");
3415 goto failed_mount; 3447 goto failed_mount;
@@ -3491,7 +3523,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3491 3523
3492 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3524 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3493 mutex_init(&sbi->s_orphan_lock); 3525 mutex_init(&sbi->s_orphan_lock);
3494 mutex_init(&sbi->s_resize_lock); 3526 sbi->s_resize_flags = 0;
3495 3527
3496 sb->s_root = NULL; 3528 sb->s_root = NULL;
3497 3529
@@ -3741,12 +3773,8 @@ failed_mount_wq:
3741 } 3773 }
3742failed_mount3: 3774failed_mount3:
3743 del_timer(&sbi->s_err_report); 3775 del_timer(&sbi->s_err_report);
3744 if (sbi->s_flex_groups) { 3776 if (sbi->s_flex_groups)
3745 if (is_vmalloc_addr(sbi->s_flex_groups)) 3777 ext4_kvfree(sbi->s_flex_groups);
3746 vfree(sbi->s_flex_groups);
3747 else
3748 kfree(sbi->s_flex_groups);
3749 }
3750 percpu_counter_destroy(&sbi->s_freeblocks_counter); 3778 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3751 percpu_counter_destroy(&sbi->s_freeinodes_counter); 3779 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3752 percpu_counter_destroy(&sbi->s_dirs_counter); 3780 percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -3756,7 +3784,7 @@ failed_mount3:
3756failed_mount2: 3784failed_mount2:
3757 for (i = 0; i < db_count; i++) 3785 for (i = 0; i < db_count; i++)
3758 brelse(sbi->s_group_desc[i]); 3786 brelse(sbi->s_group_desc[i]);
3759 kfree(sbi->s_group_desc); 3787 ext4_kvfree(sbi->s_group_desc);
3760failed_mount: 3788failed_mount:
3761 if (sbi->s_proc) { 3789 if (sbi->s_proc) {
3762 remove_proc_entry(sb->s_id, ext4_proc_root); 3790 remove_proc_entry(sb->s_id, ext4_proc_root);
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
new file mode 100644
index 000000000000..011ba6670d99
--- /dev/null
+++ b/fs/ext4/truncate.h
@@ -0,0 +1,43 @@
1/*
2 * linux/fs/ext4/truncate.h
3 *
4 * Common inline functions needed for truncate support
5 */
6
7/*
8 * Truncate blocks that were not used by write. We have to truncate the
9 * pagecache as well so that corresponding buffers get properly unmapped.
10 */
11static inline void ext4_truncate_failed_write(struct inode *inode)
12{
13 truncate_inode_pages(inode->i_mapping, inode->i_size);
14 ext4_truncate(inode);
15}
16
17/*
18 * Work out how many blocks we need to proceed with the next chunk of a
19 * truncate transaction.
20 */
21static inline unsigned long ext4_blocks_for_truncate(struct inode *inode)
22{
23 ext4_lblk_t needed;
24
25 needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
26
27 /* Give ourselves just enough room to cope with inodes in which
28 * i_blocks is corrupt: we've seen disk corruptions in the past
29 * which resulted in random data in an inode which looked enough
30 * like a regular file for ext4 to try to delete it. Things
31 * will go a bit crazy if that happens, but at least we should
32 * try not to panic the whole kernel. */
33 if (needed < 2)
34 needed = 2;
35
36 /* But we need to bound the transaction so we don't overflow the
37 * journal. */
38 if (needed > EXT4_MAX_TRANS_DATA)
39 needed = EXT4_MAX_TRANS_DATA;
40
41 return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
42}
43
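A quick sanity check of the arithmetic above: i_blocks is kept in 512-byte sectors, so on a 4KiB-block filesystem s_blocksize_bits - 9 = 3 and needed = i_blocks / 8, i.e. roughly one credit per filesystem block. A 40MiB file (81920 sectors) would thus ask for 10240 credits, which the EXT4_MAX_TRANS_DATA clamp cuts down before the fixed EXT4_DATA_TRANS_BLOCKS(inode->i_sb) overhead is added, keeping each truncate chunk within what a single journal transaction can hold.
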
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index d5e33a077a67..d0dddaceac59 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -82,18 +82,14 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
82 return PTR_ERR(acl); 82 return PTR_ERR(acl);
83 } 83 }
84 if (acl) { 84 if (acl) {
85 mode_t mode;
86
87 error = posix_acl_valid(acl); 85 error = posix_acl_valid(acl);
88 if (error) 86 if (error)
89 goto failed; 87 goto failed;
90 switch (type) { 88 switch (type) {
91 case ACL_TYPE_ACCESS: 89 case ACL_TYPE_ACCESS:
92 mode = inode->i_mode; 90 error = posix_acl_equiv_mode(acl, &inode->i_mode);
93 error = posix_acl_equiv_mode(acl, &mode);
94 if (error < 0) 91 if (error < 0)
95 goto failed; 92 goto failed;
96 inode->i_mode = mode;
97 inode->i_ctime = CURRENT_TIME; 93 inode->i_ctime = CURRENT_TIME;
98 if (error == 0) { 94 if (error == 0) {
99 posix_acl_release(acl); 95 posix_acl_release(acl);
@@ -125,21 +121,20 @@ int
125generic_acl_init(struct inode *inode, struct inode *dir) 121generic_acl_init(struct inode *inode, struct inode *dir)
126{ 122{
127 struct posix_acl *acl = NULL; 123 struct posix_acl *acl = NULL;
128 mode_t mode = inode->i_mode;
129 int error; 124 int error;
130 125
131 inode->i_mode = mode & ~current_umask();
132 if (!S_ISLNK(inode->i_mode)) 126 if (!S_ISLNK(inode->i_mode))
133 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT); 127 acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
134 if (acl) { 128 if (acl) {
135 if (S_ISDIR(inode->i_mode)) 129 if (S_ISDIR(inode->i_mode))
136 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl); 130 set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
137 error = posix_acl_create(&acl, GFP_KERNEL, &mode); 131 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
138 if (error < 0) 132 if (error < 0)
139 return error; 133 return error;
140 inode->i_mode = mode;
141 if (error > 0) 134 if (error > 0)
142 set_cached_acl(inode, ACL_TYPE_ACCESS, acl); 135 set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
136 } else {
137 inode->i_mode &= ~current_umask();
143 } 138 }
144 error = 0; 139 error = 0;
145 140
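
Passing &inode->i_mode straight into posix_acl_equiv_mode() works because the helper only writes the mode back on a non-negative return; the callers above then key off its three-way result: negative errno for an invalid ACL, 0 when the ACL is fully representable by the mode bits (so the ACL object can be dropped), and 1 when an extended ACL must still be stored. A sketch of the calling convention:

	error = posix_acl_equiv_mode(acl, &inode->i_mode);
	if (error < 0)
		return error;		/* malformed ACL */
	if (error == 0) {
		posix_acl_release(acl);	/* the mode bits say it all */
		acl = NULL;
	}
	/* error == 1: keep the ACL and store it alongside the updated mode */
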
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 884c9af0542f..34501b64bc47 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -72,7 +72,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
72 return gfs2_acl_get(GFS2_I(inode), type); 72 return gfs2_acl_get(GFS2_I(inode), type);
73} 73}
74 74
75static int gfs2_set_mode(struct inode *inode, mode_t mode) 75static int gfs2_set_mode(struct inode *inode, umode_t mode)
76{ 76{
77 int error = 0; 77 int error = 0;
78 78
@@ -117,7 +117,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
117{ 117{
118 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 118 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
119 struct posix_acl *acl; 119 struct posix_acl *acl;
120 mode_t mode = inode->i_mode; 120 umode_t mode = inode->i_mode;
121 int error = 0; 121 int error = 0;
122 122
123 if (!sdp->sd_args.ar_posix_acl) 123 if (!sdp->sd_args.ar_posix_acl)
@@ -276,7 +276,7 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
276 goto out_release; 276 goto out_release;
277 277
278 if (type == ACL_TYPE_ACCESS) { 278 if (type == ACL_TYPE_ACCESS) {
279 mode_t mode = inode->i_mode; 279 umode_t mode = inode->i_mode;
280 error = posix_acl_equiv_mode(acl, &mode); 280 error = posix_acl_equiv_mode(acl, &mode);
281 281
282 if (error <= 0) { 282 if (error <= 0) {
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 8635be5ffd97..970ea987b3f6 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -16,6 +16,7 @@
16#include <linux/statfs.h> 16#include <linux/statfs.h>
17#include <linux/types.h> 17#include <linux/types.h>
18#include <linux/pid_namespace.h> 18#include <linux/pid_namespace.h>
19#include <linux/namei.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
20#include "os.h" 21#include "os.h"
21 22
diff --git a/fs/inode.c b/fs/inode.c
index d0c72ff6b30e..73920d555c88 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -143,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
143 inode->i_op = &empty_iops; 143 inode->i_op = &empty_iops;
144 inode->i_fop = &empty_fops; 144 inode->i_fop = &empty_fops;
145 inode->i_nlink = 1; 145 inode->i_nlink = 1;
146 inode->i_opflags = 0;
146 inode->i_uid = 0; 147 inode->i_uid = 0;
147 inode->i_gid = 0; 148 inode->i_gid = 0;
148 atomic_set(&inode->i_writecount, 0); 149 atomic_set(&inode->i_writecount, 0);
@@ -399,12 +400,12 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
399EXPORT_SYMBOL(__insert_inode_hash); 400EXPORT_SYMBOL(__insert_inode_hash);
400 401
401/** 402/**
402 * remove_inode_hash - remove an inode from the hash 403 * __remove_inode_hash - remove an inode from the hash
403 * @inode: inode to unhash 404 * @inode: inode to unhash
404 * 405 *
405 * Remove an inode from the superblock. 406 * Remove an inode from the superblock.
406 */ 407 */
407void remove_inode_hash(struct inode *inode) 408void __remove_inode_hash(struct inode *inode)
408{ 409{
409 spin_lock(&inode_hash_lock); 410 spin_lock(&inode_hash_lock);
410 spin_lock(&inode->i_lock); 411 spin_lock(&inode->i_lock);
@@ -412,7 +413,7 @@ void remove_inode_hash(struct inode *inode)
412 spin_unlock(&inode->i_lock); 413 spin_unlock(&inode->i_lock);
413 spin_unlock(&inode_hash_lock); 414 spin_unlock(&inode_hash_lock);
414} 415}
415EXPORT_SYMBOL(remove_inode_hash); 416EXPORT_SYMBOL(__remove_inode_hash);
416 417
417void end_writeback(struct inode *inode) 418void end_writeback(struct inode *inode)
418{ 419{
@@ -454,7 +455,9 @@ static void evict(struct inode *inode)
454 BUG_ON(!(inode->i_state & I_FREEING)); 455 BUG_ON(!(inode->i_state & I_FREEING));
455 BUG_ON(!list_empty(&inode->i_lru)); 456 BUG_ON(!list_empty(&inode->i_lru));
456 457
457 inode_wb_list_del(inode); 458 if (!list_empty(&inode->i_wb_list))
459 inode_wb_list_del(inode);
460
458 inode_sb_list_del(inode); 461 inode_sb_list_del(inode);
459 462
460 if (op->evict_inode) { 463 if (op->evict_inode) {
@@ -1328,7 +1331,8 @@ static void iput_final(struct inode *inode)
1328 } 1331 }
1329 1332
1330 inode->i_state |= I_FREEING; 1333 inode->i_state |= I_FREEING;
1331 inode_lru_list_del(inode); 1334 if (!list_empty(&inode->i_lru))
1335 inode_lru_list_del(inode);
1332 spin_unlock(&inode->i_lock); 1336 spin_unlock(&inode->i_lock);
1333 1337
1334 evict(inode); 1338 evict(inode);
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 2c62c5aae82f..16a698bd906d 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -257,9 +257,12 @@ static void
257__flush_batch(journal_t *journal, int *batch_count) 257__flush_batch(journal_t *journal, int *batch_count)
258{ 258{
259 int i; 259 int i;
260 struct blk_plug plug;
260 261
262 blk_start_plug(&plug);
261 for (i = 0; i < *batch_count; i++) 263 for (i = 0; i < *batch_count; i++)
262 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); 264 write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC);
265 blk_finish_plug(&plug);
263 266
264 for (i = 0; i < *batch_count; i++) { 267 for (i = 0; i < *batch_count; i++) {
265 struct buffer_head *bh = journal->j_chkpt_bhs[i]; 268 struct buffer_head *bh = journal->j_chkpt_bhs[i];
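
Plugging the queue around the submission loop lets the block layer merge the whole checkpoint batch into fewer, larger requests before anything is dispatched, and the switch from WRITE to WRITE_SYNC marks the I/O as something a waiter cares about. The general shape of the pattern (a sketch, not the exact jbd2 code):

	struct blk_plug plug;

	blk_start_plug(&plug);			/* hold back dispatch, allow merging */
	for (i = 0; i < nr; i++)
		write_dirty_buffer(bhs[i], WRITE_SYNC);
	blk_finish_plug(&plug);			/* flush the merged batch to the device */
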
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 0dfa5b598e68..f24df13adc4e 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -2390,73 +2390,6 @@ static void __exit journal_exit(void)
2390 jbd2_journal_destroy_caches(); 2390 jbd2_journal_destroy_caches();
2391} 2391}
2392 2392
2393/*
2394 * jbd2_dev_to_name is a utility function used by the jbd2 and ext4
2395 * tracing infrastructure to map a dev_t to a device name.
2396 *
2397 * The caller should use rcu_read_lock() in order to make sure the
2398 * device name stays valid until its done with it. We use
2399 * rcu_read_lock() as well to make sure we're safe in case the caller
2400 * gets sloppy, and because rcu_read_lock() is cheap and can be safely
2401 * nested.
2402 */
2403struct devname_cache {
2404 struct rcu_head rcu;
2405 dev_t device;
2406 char devname[BDEVNAME_SIZE];
2407};
2408#define CACHE_SIZE_BITS 6
2409static struct devname_cache *devcache[1 << CACHE_SIZE_BITS];
2410static DEFINE_SPINLOCK(devname_cache_lock);
2411
2412static void free_devcache(struct rcu_head *rcu)
2413{
2414 kfree(rcu);
2415}
2416
2417const char *jbd2_dev_to_name(dev_t device)
2418{
2419 int i = hash_32(device, CACHE_SIZE_BITS);
2420 char *ret;
2421 struct block_device *bd;
2422 static struct devname_cache *new_dev;
2423
2424 rcu_read_lock();
2425 if (devcache[i] && devcache[i]->device == device) {
2426 ret = devcache[i]->devname;
2427 rcu_read_unlock();
2428 return ret;
2429 }
2430 rcu_read_unlock();
2431
2432 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2433 if (!new_dev)
2434 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2435 bd = bdget(device);
2436 spin_lock(&devname_cache_lock);
2437 if (devcache[i]) {
2438 if (devcache[i]->device == device) {
2439 kfree(new_dev);
2440 bdput(bd);
2441 ret = devcache[i]->devname;
2442 spin_unlock(&devname_cache_lock);
2443 return ret;
2444 }
2445 call_rcu(&devcache[i]->rcu, free_devcache);
2446 }
2447 devcache[i] = new_dev;
2448 devcache[i]->device = device;
2449 if (bd) {
2450 bdevname(bd, devcache[i]->devname);
2451 bdput(bd);
2452 } else
2453 __bdevname(device, devcache[i]->devname);
2454 ret = devcache[i]->devname;
2455 spin_unlock(&devname_cache_lock);
2456 return ret;
2457}
2458EXPORT_SYMBOL(jbd2_dev_to_name);
2459
2460MODULE_LICENSE("GPL"); 2393MODULE_LICENSE("GPL");
2461module_init(journal_init); 2394module_init(journal_init);
2462module_exit(journal_exit); 2395module_exit(journal_exit);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 27c511a1cf05..926d02068a14 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -227,7 +227,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
227 case ACL_TYPE_ACCESS: 227 case ACL_TYPE_ACCESS:
228 xprefix = JFFS2_XPREFIX_ACL_ACCESS; 228 xprefix = JFFS2_XPREFIX_ACL_ACCESS;
229 if (acl) { 229 if (acl) {
230 mode_t mode = inode->i_mode; 230 umode_t mode = inode->i_mode;
231 rc = posix_acl_equiv_mode(acl, &mode); 231 rc = posix_acl_equiv_mode(acl, &mode);
232 if (rc < 0) 232 if (rc < 0)
233 return rc; 233 return rc;
@@ -259,7 +259,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
259 return rc; 259 return rc;
260} 260}
261 261
262int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, mode_t *i_mode) 262int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode)
263{ 263{
264 struct posix_acl *acl; 264 struct posix_acl *acl;
265 int rc; 265 int rc;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index b3421c78d9f8..9b477246f2a6 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
28 28
29struct posix_acl *jffs2_get_acl(struct inode *inode, int type); 29struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, mode_t *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern const struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index b81b35ddf4e4..bbcb9755dd2b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -406,7 +406,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
406 406
407/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash, 407/* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
408 fill in the raw_inode while you're at it. */ 408 fill in the raw_inode while you're at it. */
409struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, struct jffs2_raw_inode *ri) 409struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_raw_inode *ri)
410{ 410{
411 struct inode *inode; 411 struct inode *inode;
412 struct super_block *sb = dir_i->i_sb; 412 struct super_block *sb = dir_i->i_sb;
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 526979c607b6..6c1755c59c0f 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -173,7 +173,7 @@ int jffs2_do_setattr (struct inode *, struct iattr *);
173struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
174void jffs2_evict_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
175void jffs2_dirty_inode(struct inode *inode, int flags); 175void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, mode_t mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
179int jffs2_remount_fs (struct super_block *, int *, char *); 179int jffs2_remount_fs (struct super_block *, int *, char *);
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index b3a32caf2b45..45559dc3ea2f 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -127,16 +127,14 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
127 return PTR_ERR(acl); 127 return PTR_ERR(acl);
128 128
129 if (acl) { 129 if (acl) {
130 mode_t mode = inode->i_mode;
131 if (S_ISDIR(inode->i_mode)) { 130 if (S_ISDIR(inode->i_mode)) {
132 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl); 131 rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl);
133 if (rc) 132 if (rc)
134 goto cleanup; 133 goto cleanup;
135 } 134 }
136 rc = posix_acl_create(&acl, GFP_KERNEL, &mode); 135 rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
137 if (rc < 0) 136 if (rc < 0)
138 goto cleanup; /* posix_acl_release(NULL) is no-op */ 137 goto cleanup; /* posix_acl_release(NULL) is no-op */
139 inode->i_mode = mode;
140 if (rc > 0) 138 if (rc > 0)
141 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); 139 rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
142cleanup: 140cleanup:
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 24838f1eeee5..e87fedef23db 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -693,8 +693,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
693 return rc; 693 return rc;
694 } 694 }
695 if (acl) { 695 if (acl) {
696 mode_t mode = inode->i_mode; 696 rc = posix_acl_equiv_mode(acl, &inode->i_mode);
697 rc = posix_acl_equiv_mode(acl, &mode);
698 posix_acl_release(acl); 697 posix_acl_release(acl);
699 if (rc < 0) { 698 if (rc < 0) {
700 printk(KERN_ERR 699 printk(KERN_ERR
@@ -702,7 +701,6 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
702 rc); 701 rc);
703 return rc; 702 return rc;
704 } 703 }
705 inode->i_mode = mode;
706 mark_inode_dirty(inode); 704 mark_inode_dirty(inode);
707 } 705 }
708 /* 706 /*
diff --git a/fs/namei.c b/fs/namei.c
index f8c69d373793..2826db35dc25 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -179,19 +179,14 @@ static int check_acl(struct inode *inode, int mask)
179#ifdef CONFIG_FS_POSIX_ACL 179#ifdef CONFIG_FS_POSIX_ACL
180 struct posix_acl *acl; 180 struct posix_acl *acl;
181 181
182 /*
183 * Under RCU walk, we cannot even do a "get_cached_acl()",
184 * because that involves locking and getting a refcount on
185 * a cached ACL.
186 *
187 * So the only case we handle during RCU walking is the
188 * case of a cached "no ACL at all", which needs no locks
189 * or refcounts.
190 */
191 if (mask & MAY_NOT_BLOCK) { 182 if (mask & MAY_NOT_BLOCK) {
192 if (negative_cached_acl(inode, ACL_TYPE_ACCESS)) 183 acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
184 if (!acl)
193 return -EAGAIN; 185 return -EAGAIN;
194 return -ECHILD; 186 /* no ->get_acl() calls in RCU mode... */
187 if (acl == ACL_NOT_CACHED)
188 return -ECHILD;
189 return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
195 } 190 }
196 191
197 acl = get_cached_acl(inode, ACL_TYPE_ACCESS); 192 acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
@@ -313,6 +308,26 @@ int generic_permission(struct inode *inode, int mask)
313 return -EACCES; 308 return -EACCES;
314} 309}
315 310
311/*
312 * We _really_ want to just do "generic_permission()" without
313 * even looking at the inode->i_op values. So we keep a cache
 314 * flag in inode->i_opflags, that says "this has no special
315 * permission function, use the fast case".
316 */
317static inline int do_inode_permission(struct inode *inode, int mask)
318{
319 if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
320 if (likely(inode->i_op->permission))
321 return inode->i_op->permission(inode, mask);
322
323 /* This gets set once for the inode lifetime */
324 spin_lock(&inode->i_lock);
325 inode->i_opflags |= IOP_FASTPERM;
326 spin_unlock(&inode->i_lock);
327 }
328 return generic_permission(inode, mask);
329}
330
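
do_inode_permission() is the first of three helpers in this patch (should_follow_link() and can_lookup() appear further down) built on the same trick: cache the answer to "does this inode have a special i_op method?" as a bit in inode->i_opflags. Because i_op never changes for the life of the inode, the bit is set at most once, under i_lock, and can then be tested locklessly forever after; a reader racing with the first setter merely takes the slow path one extra time, which is harmless.
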
316/** 331/**
317 * inode_permission - check for access rights to a given inode 332 * inode_permission - check for access rights to a given inode
318 * @inode: inode to check permission on 333 * @inode: inode to check permission on
@@ -327,7 +342,7 @@ int inode_permission(struct inode *inode, int mask)
327{ 342{
328 int retval; 343 int retval;
329 344
330 if (mask & MAY_WRITE) { 345 if (unlikely(mask & MAY_WRITE)) {
331 umode_t mode = inode->i_mode; 346 umode_t mode = inode->i_mode;
332 347
333 /* 348 /*
@@ -344,11 +359,7 @@ int inode_permission(struct inode *inode, int mask)
344 return -EACCES; 359 return -EACCES;
345 } 360 }
346 361
347 if (inode->i_op->permission) 362 retval = do_inode_permission(inode, mask);
348 retval = inode->i_op->permission(inode, mask);
349 else
350 retval = generic_permission(inode, mask);
351
352 if (retval) 363 if (retval)
353 return retval; 364 return retval;
354 365
@@ -716,19 +727,25 @@ static int follow_automount(struct path *path, unsigned flags,
716 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT)) 727 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
717 return -EISDIR; /* we actually want to stop here */ 728 return -EISDIR; /* we actually want to stop here */
718 729
719 /* We want to mount if someone is trying to open/create a file of any 730 /*
720 * type under the mountpoint, wants to traverse through the mountpoint
721 * or wants to open the mounted directory.
722 *
723 * We don't want to mount if someone's just doing a stat and they've 731 * We don't want to mount if someone's just doing a stat and they've
724 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and 732 * set AT_SYMLINK_NOFOLLOW - unless they're stat'ing a directory and
725 * appended a '/' to the name. 733 * appended a '/' to the name.
726 */ 734 */
727 if (!(flags & LOOKUP_FOLLOW) && 735 if (!(flags & LOOKUP_FOLLOW)) {
728 !(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | 736 /* We do, however, want to mount if someone wants to open or
729 LOOKUP_OPEN | LOOKUP_CREATE))) 737 * create a file of any type under the mountpoint, wants to
730 return -EISDIR; 738 * traverse through the mountpoint or wants to open the mounted
731 739 * directory.
740 * Also, autofs may mark negative dentries as being automount
741 * points. These will need the attentions of the daemon to
742 * instantiate them before they can be used.
743 */
744 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
745 LOOKUP_OPEN | LOOKUP_CREATE)) &&
746 path->dentry->d_inode)
747 return -EISDIR;
748 }
732 current->total_link_count++; 749 current->total_link_count++;
733 if (current->total_link_count >= 40) 750 if (current->total_link_count >= 40)
734 return -ELOOP; 751 return -ELOOP;
@@ -1244,6 +1261,26 @@ static void terminate_walk(struct nameidata *nd)
1244 } 1261 }
1245} 1262}
1246 1263
1264/*
1265 * Do we need to follow links? We _really_ want to be able
1266 * to do this check without having to look at inode->i_op,
1267 * so we keep a cache of "no, this doesn't need follow_link"
1268 * for the common case.
1269 */
1270static inline int should_follow_link(struct inode *inode, int follow)
1271{
1272 if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
1273 if (likely(inode->i_op->follow_link))
1274 return follow;
1275
1276 /* This gets set once for the inode lifetime */
1277 spin_lock(&inode->i_lock);
1278 inode->i_opflags |= IOP_NOFOLLOW;
1279 spin_unlock(&inode->i_lock);
1280 }
1281 return 0;
1282}
1283
1247static inline int walk_component(struct nameidata *nd, struct path *path, 1284static inline int walk_component(struct nameidata *nd, struct path *path,
1248 struct qstr *name, int type, int follow) 1285 struct qstr *name, int type, int follow)
1249{ 1286{
@@ -1266,7 +1303,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
1266 terminate_walk(nd); 1303 terminate_walk(nd);
1267 return -ENOENT; 1304 return -ENOENT;
1268 } 1305 }
1269 if (unlikely(inode->i_op->follow_link) && follow) { 1306 if (should_follow_link(inode, follow)) {
1270 if (nd->flags & LOOKUP_RCU) { 1307 if (nd->flags & LOOKUP_RCU) {
1271 if (unlikely(unlazy_walk(nd, path->dentry))) { 1308 if (unlikely(unlazy_walk(nd, path->dentry))) {
1272 terminate_walk(nd); 1309 terminate_walk(nd);
@@ -1319,6 +1356,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1319} 1356}
1320 1357
1321/* 1358/*
1359 * We really don't want to look at inode->i_op->lookup
1360 * when we don't have to. So we keep a cache bit in
1361 * the inode ->i_opflags field that says "yes, we can
1362 * do lookup on this inode".
1363 */
1364static inline int can_lookup(struct inode *inode)
1365{
1366 if (likely(inode->i_opflags & IOP_LOOKUP))
1367 return 1;
1368 if (likely(!inode->i_op->lookup))
1369 return 0;
1370
1371 /* We do this once for the lifetime of the inode */
1372 spin_lock(&inode->i_lock);
1373 inode->i_opflags |= IOP_LOOKUP;
1374 spin_unlock(&inode->i_lock);
1375 return 1;
1376}
1377
1378/*
1322 * Name resolution. 1379 * Name resolution.
1323 * This is the basic name resolution function, turning a pathname into 1380 * This is the basic name resolution function, turning a pathname into
1324 * the final dentry. We expect 'base' to be positive and a directory. 1381 * the final dentry. We expect 'base' to be positive and a directory.
@@ -1397,10 +1454,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1397 if (err) 1454 if (err)
1398 return err; 1455 return err;
1399 } 1456 }
1457 if (can_lookup(nd->inode))
1458 continue;
1400 err = -ENOTDIR; 1459 err = -ENOTDIR;
1401 if (!nd->inode->i_op->lookup) 1460 break;
1402 break;
1403 continue;
1404 /* here ends the main loop */ 1461 /* here ends the main loop */
1405 1462
1406last_component: 1463last_component:
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index e49e73107e62..7ef23979896d 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -415,7 +415,7 @@ fail:
415} 415}
416 416
417int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode, 417int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
418 mode_t mode) 418 umode_t mode)
419{ 419{
420 struct posix_acl *dfacl, *acl; 420 struct posix_acl *dfacl, *acl;
421 int error = 0; 421 int error = 0;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 38053d823eb0..85f1690ca08c 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -316,7 +316,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
316 int flags, struct nfs_open_context *ctx) 316 int flags, struct nfs_open_context *ctx)
317{ 317{
318 struct nfs3_createdata *data; 318 struct nfs3_createdata *data;
319 mode_t mode = sattr->ia_mode; 319 umode_t mode = sattr->ia_mode;
320 int status = -ENOMEM; 320 int status = -ENOMEM;
321 321
322 dprintk("NFS call create %s\n", dentry->d_name.name); 322 dprintk("NFS call create %s\n", dentry->d_name.name);
@@ -562,7 +562,7 @@ static int
562nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) 562nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
563{ 563{
564 struct nfs3_createdata *data; 564 struct nfs3_createdata *data;
565 int mode = sattr->ia_mode; 565 umode_t mode = sattr->ia_mode;
566 int status = -ENOMEM; 566 int status = -ENOMEM;
567 567
568 dprintk("NFS call mkdir %s\n", dentry->d_name.name); 568 dprintk("NFS call mkdir %s\n", dentry->d_name.name);
@@ -681,7 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
681 dev_t rdev) 681 dev_t rdev)
682{ 682{
683 struct nfs3_createdata *data; 683 struct nfs3_createdata *data;
684 mode_t mode = sattr->ia_mode; 684 umode_t mode = sattr->ia_mode;
685 int status = -ENOMEM; 685 int status = -ENOMEM;
686 686
687 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, 687 dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 783c58d9daf1..a7219075b4de 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -247,7 +247,7 @@ static int ocfs2_set_acl(handle_t *handle,
247 case ACL_TYPE_ACCESS: 247 case ACL_TYPE_ACCESS:
248 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; 248 name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
249 if (acl) { 249 if (acl) {
250 mode_t mode = inode->i_mode; 250 umode_t mode = inode->i_mode;
251 ret = posix_acl_equiv_mode(acl, &mode); 251 ret = posix_acl_equiv_mode(acl, &mode);
252 if (ret < 0) 252 if (ret < 0)
253 return ret; 253 return ret;
@@ -351,7 +351,7 @@ int ocfs2_init_acl(handle_t *handle,
351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 351 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
352 struct posix_acl *acl = NULL; 352 struct posix_acl *acl = NULL;
353 int ret = 0, ret2; 353 int ret = 0, ret2;
354 mode_t mode; 354 umode_t mode;
355 355
356 if (!S_ISLNK(inode->i_mode)) { 356 if (!S_ISLNK(inode->i_mode)) {
357 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { 357 if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index d43729a760e2..10027b42b7e2 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -149,10 +149,10 @@ posix_acl_valid(const struct posix_acl *acl)
149 * file mode permission bits, or else 1. Returns -E... on error. 149 * file mode permission bits, or else 1. Returns -E... on error.
150 */ 150 */
151int 151int
152posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p) 152posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
153{ 153{
154 const struct posix_acl_entry *pa, *pe; 154 const struct posix_acl_entry *pa, *pe;
155 mode_t mode = 0; 155 umode_t mode = 0;
156 int not_equiv = 0; 156 int not_equiv = 0;
157 157
158 FOREACH_ACL_ENTRY(pa, acl, pe) { 158 FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -188,7 +188,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p)
188 * Create an ACL representing the file mode permission bits of an inode. 188 * Create an ACL representing the file mode permission bits of an inode.
189 */ 189 */
190struct posix_acl * 190struct posix_acl *
191posix_acl_from_mode(mode_t mode, gfp_t flags) 191posix_acl_from_mode(umode_t mode, gfp_t flags)
192{ 192{
193 struct posix_acl *acl = posix_acl_alloc(3, flags); 193 struct posix_acl *acl = posix_acl_alloc(3, flags);
194 if (!acl) 194 if (!acl)
@@ -279,11 +279,11 @@ check_perm:
279 * system calls. All permissions that are not granted by the acl are removed. 279 * system calls. All permissions that are not granted by the acl are removed.
280 * The permissions in the acl are changed to reflect the mode_p parameter. 280 * The permissions in the acl are changed to reflect the mode_p parameter.
281 */ 281 */
282static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p) 282static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
283{ 283{
284 struct posix_acl_entry *pa, *pe; 284 struct posix_acl_entry *pa, *pe;
285 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 285 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
286 mode_t mode = *mode_p; 286 umode_t mode = *mode_p;
287 int not_equiv = 0; 287 int not_equiv = 0;
288 288
289 /* assert(atomic_read(acl->a_refcount) == 1); */ 289 /* assert(atomic_read(acl->a_refcount) == 1); */
@@ -336,7 +336,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, mode_t *mode_p)
336/* 336/*
337 * Modify the ACL for the chmod syscall. 337 * Modify the ACL for the chmod syscall.
338 */ 338 */
339static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode) 339static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
340{ 340{
341 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL; 341 struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
342 struct posix_acl_entry *pa, *pe; 342 struct posix_acl_entry *pa, *pe;
@@ -382,7 +382,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, mode_t mode)
382} 382}
383 383
384int 384int
385posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p) 385posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
386{ 386{
387 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 387 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
388 int err = -ENOMEM; 388 int err = -ENOMEM;
@@ -400,7 +400,7 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, mode_t *mode_p)
400EXPORT_SYMBOL(posix_acl_create); 400EXPORT_SYMBOL(posix_acl_create);
401 401
402int 402int
403posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, mode_t mode) 403posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
404{ 404{
405 struct posix_acl *clone = posix_acl_clone(*acl, gfp); 405 struct posix_acl *clone = posix_acl_clone(*acl, gfp);
406 int err = -ENOMEM; 406 int err = -ENOMEM;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 08e3eccf9a12..5eb02069e1b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1118,7 +1118,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1118 * Warn that /proc/pid/oom_adj is deprecated, see 1118 * Warn that /proc/pid/oom_adj is deprecated, see
1119 * Documentation/feature-removal-schedule.txt. 1119 * Documentation/feature-removal-schedule.txt.
1120 */ 1120 */
1121 WARN_ONCE(1, "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", 1121 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
1122 current->comm, task_pid_nr(current), task_pid_nr(task), 1122 current->comm, task_pid_nr(current), task_pid_nr(task),
1123 task_pid_nr(task)); 1123 task_pid_nr(task));
1124 task->signal->oom_adj = oom_adjust; 1124 task->signal->oom_adj = oom_adjust;
@@ -1919,6 +1919,14 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1919 spin_lock(&files->file_lock); 1919 spin_lock(&files->file_lock);
1920 file = fcheck_files(files, fd); 1920 file = fcheck_files(files, fd);
1921 if (file) { 1921 if (file) {
1922 unsigned int f_flags;
1923 struct fdtable *fdt;
1924
1925 fdt = files_fdtable(files);
1926 f_flags = file->f_flags & ~O_CLOEXEC;
1927 if (FD_ISSET(fd, fdt->close_on_exec))
1928 f_flags |= O_CLOEXEC;
1929
1922 if (path) { 1930 if (path) {
1923 *path = file->f_path; 1931 *path = file->f_path;
1924 path_get(&file->f_path); 1932 path_get(&file->f_path);
@@ -1928,7 +1936,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1928 "pos:\t%lli\n" 1936 "pos:\t%lli\n"
1929 "flags:\t0%o\n", 1937 "flags:\t0%o\n",
1930 (long long) file->f_pos, 1938 (long long) file->f_pos,
1931 file->f_flags); 1939 f_flags);
1932 spin_unlock(&files->file_lock); 1940 spin_unlock(&files->file_lock);
1933 put_files_struct(files); 1941 put_files_struct(files);
1934 return 0; 1942 return 0;
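
The close-on-exec bit lives in the fdtable bitmap rather than in file->f_flags because a struct file can be shared by several descriptors, each with its own setting; that is why fdinfo has to reassemble an effective f_flags per descriptor as above. A minimal user-space illustration of the same distinction (any readable path will do):

	#include <fcntl.h>
	#include <stdio.h>

	int main(void)
	{
		int fd = open("/etc/hostname", O_RDONLY | O_CLOEXEC);
		if (fd < 0)
			return 1;
		/* F_GETFD reads descriptor flags (FD_CLOEXEC), not file flags */
		printf("cloexec: %d\n", (fcntl(fd, F_GETFD) & FD_CLOEXEC) != 0);
		return 0;
	}

With this patch, the same information also shows up in the octal flags line of /proc/<pid>/fdinfo/<fd>.
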
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 977ed2723845..893b961dcfd8 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -39,8 +39,9 @@
39#define PSTORE_NAMELEN 64 39#define PSTORE_NAMELEN 64
40 40
41struct pstore_private { 41struct pstore_private {
42 struct pstore_info *psi;
43 enum pstore_type_id type;
42 u64 id; 44 u64 id;
43 int (*erase)(u64);
44 ssize_t size; 45 ssize_t size;
45 char data[]; 46 char data[];
46}; 47};
@@ -73,7 +74,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
73{ 74{
74 struct pstore_private *p = dentry->d_inode->i_private; 75 struct pstore_private *p = dentry->d_inode->i_private;
75 76
76 p->erase(p->id); 77 p->psi->erase(p->type, p->id, p->psi);
77 78
78 return simple_unlink(dir, dentry); 79 return simple_unlink(dir, dentry);
79} 80}
@@ -175,8 +176,8 @@ int pstore_is_mounted(void)
175 * Set the mtime & ctime to the date that this record was originally stored. 176 * Set the mtime & ctime to the date that this record was originally stored.
176 */ 177 */
177int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, 178int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
178 char *data, size_t size, 179 char *data, size_t size, struct timespec time,
179 struct timespec time, int (*erase)(u64)) 180 struct pstore_info *psi)
180{ 181{
181 struct dentry *root = pstore_sb->s_root; 182 struct dentry *root = pstore_sb->s_root;
182 struct dentry *dentry; 183 struct dentry *dentry;
@@ -192,8 +193,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
192 private = kmalloc(sizeof *private + size, GFP_KERNEL); 193 private = kmalloc(sizeof *private + size, GFP_KERNEL);
193 if (!private) 194 if (!private)
194 goto fail_alloc; 195 goto fail_alloc;
196 private->type = type;
195 private->id = id; 197 private->id = id;
196 private->erase = erase; 198 private->psi = psi;
197 199
198 switch (type) { 200 switch (type) {
199 case PSTORE_TYPE_DMESG: 201 case PSTORE_TYPE_DMESG:
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 8c9f23eb1645..611c1b3c46fa 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -2,5 +2,5 @@ extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void); 2extern void pstore_get_records(void);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, 3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size, 4 char *data, size_t size,
5 struct timespec time, int (*erase)(u64)); 5 struct timespec time, struct pstore_info *psi);
6extern int pstore_is_mounted(void); 6extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f2c3ff20ea68..c5300ec31696 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -37,6 +37,8 @@
37static DEFINE_SPINLOCK(pstore_lock); 37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo; 38static struct pstore_info *psinfo;
39 39
40static char *backend;
41
40/* How much of the console log to snapshot */ 42/* How much of the console log to snapshot */
41static unsigned long kmsg_bytes = 10240; 43static unsigned long kmsg_bytes = 10240;
42 44
@@ -67,7 +69,8 @@ static void pstore_dump(struct kmsg_dumper *dumper,
67 unsigned long size, total = 0; 69 unsigned long size, total = 0;
68 char *dst, *why; 70 char *dst, *why;
69 u64 id; 71 u64 id;
70 int hsize, part = 1; 72 int hsize;
73 unsigned int part = 1;
71 74
72 if (reason < ARRAY_SIZE(reason_str)) 75 if (reason < ARRAY_SIZE(reason_str))
73 why = reason_str[reason]; 76 why = reason_str[reason];
@@ -78,7 +81,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
78 oopscount++; 81 oopscount++;
79 while (total < kmsg_bytes) { 82 while (total < kmsg_bytes) {
80 dst = psinfo->buf; 83 dst = psinfo->buf;
81 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++); 84 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part);
82 size = psinfo->bufsize - hsize; 85 size = psinfo->bufsize - hsize;
83 dst += hsize; 86 dst += hsize;
84 87
@@ -94,14 +97,16 @@ static void pstore_dump(struct kmsg_dumper *dumper,
94 memcpy(dst, s1 + s1_start, l1_cpy); 97 memcpy(dst, s1 + s1_start, l1_cpy);
95 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); 98 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
96 99
97 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy); 100 id = psinfo->write(PSTORE_TYPE_DMESG, part,
101 hsize + l1_cpy + l2_cpy, psinfo);
98 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted()) 102 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
99 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, 103 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
100 psinfo->buf, hsize + l1_cpy + l2_cpy, 104 psinfo->buf, hsize + l1_cpy + l2_cpy,
101 CURRENT_TIME, psinfo->erase); 105 CURRENT_TIME, psinfo);
102 l1 -= l1_cpy; 106 l1 -= l1_cpy;
103 l2 -= l2_cpy; 107 l2 -= l2_cpy;
104 total += l1_cpy + l2_cpy; 108 total += l1_cpy + l2_cpy;
109 part++;
105 } 110 }
106 mutex_unlock(&psinfo->buf_mutex); 111 mutex_unlock(&psinfo->buf_mutex);
107} 112}
@@ -128,6 +133,12 @@ int pstore_register(struct pstore_info *psi)
128 spin_unlock(&pstore_lock); 133 spin_unlock(&pstore_lock);
129 return -EBUSY; 134 return -EBUSY;
130 } 135 }
136
137 if (backend && strcmp(backend, psi->name)) {
138 spin_unlock(&pstore_lock);
139 return -EINVAL;
140 }
141
131 psinfo = psi; 142 psinfo = psi;
132 spin_unlock(&pstore_lock); 143 spin_unlock(&pstore_lock);
133 144
@@ -166,9 +177,9 @@ void pstore_get_records(void)
 	if (rc)
 		goto out;
 
-	while ((size = psi->read(&id, &type, &time)) > 0) {
+	while ((size = psi->read(&id, &type, &time, psi)) > 0) {
 		if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size,
-				  time, psi->erase))
+				  time, psi))
 			failed++;
 	}
 	psi->close(psi);
@@ -196,12 +207,15 @@ int pstore_write(enum pstore_type_id type, char *buf, size_t size)
 
 	mutex_lock(&psinfo->buf_mutex);
 	memcpy(psinfo->buf, buf, size);
-	id = psinfo->write(type, size);
+	id = psinfo->write(type, 0, size, psinfo);
 	if (pstore_is_mounted())
 		pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
-			      size, CURRENT_TIME, psinfo->erase);
+			      size, CURRENT_TIME, psinfo);
 	mutex_unlock(&psinfo->buf_mutex);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pstore_write);
+
+module_param(backend, charp, 0444);
+MODULE_PARM_DESC(backend, "Pstore backend to use");
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 7362cf4c946a..6da0396e5052 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -272,12 +272,10 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			mode_t mode = inode->i_mode;
-			error = posix_acl_equiv_mode(acl, &mode);
+			error = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (error < 0)
 				return error;
 			else {
-				inode->i_mode = mode;
 				if (error == 0)
 					acl = NULL;
 			}
@@ -354,8 +352,6 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 		return PTR_ERR(acl);
 
 	if (acl) {
-		mode_t mode = inode->i_mode;
-
 		/* Copy the default ACL to the default ACL of a new directory */
 		if (S_ISDIR(inode->i_mode)) {
 			err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
@@ -366,12 +362,10 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
 
 		/* Now we reconcile the new ACL and the mode,
 		   potentially modifying both */
-		err = posix_acl_create(&acl, GFP_NOFS, &mode);
+		err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
 		if (err < 0)
 			return err;
 
-		inode->i_mode = mode;
-
 		/* If we need an ACL.. */
 		if (err > 0)
 			err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);
diff --git a/fs/stack.c b/fs/stack.c
index 4a6f7f440658..b4f2ab48a61f 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -29,10 +29,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 	 *
 	 * We don't actually know what locking is used at the lower level;
 	 * but if it's a filesystem that supports quotas, it will be using
-	 * i_lock as in inode_add_bytes().  tmpfs uses other locking, and
-	 * its 32-bit is (just) able to exceed 2TB i_size with the aid of
-	 * holes; but its i_blocks cannot carry into the upper long without
-	 * almost 2TB swap - let's ignore that case.
+	 * i_lock as in inode_add_bytes().
 	 */
 	if (sizeof(i_blocks) > sizeof(long))
 		spin_lock(&src->i_lock);
diff --git a/fs/stat.c b/fs/stat.c
index 961039121cb8..ba5316ffac61 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
 	stat->uid = inode->i_uid;
 	stat->gid = inode->i_gid;
 	stat->rdev = inode->i_rdev;
+	stat->size = i_size_read(inode);
 	stat->atime = inode->i_atime;
 	stat->mtime = inode->i_mtime;
 	stat->ctime = inode->i_ctime;
-	stat->size = i_size_read(inode);
-	stat->blocks = inode->i_blocks;
 	stat->blksize = (1 << inode->i_blkbits);
+	stat->blocks = inode->i_blocks;
 }
 
 EXPORT_SYMBOL(generic_fillattr);
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 44ce51656804..b6c4b3795c4a 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -221,7 +221,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 }
 
 static int
-xfs_set_mode(struct inode *inode, mode_t mode)
+xfs_set_mode(struct inode *inode, umode_t mode)
 {
 	int error = 0;
 
@@ -267,7 +267,7 @@ posix_acl_default_exists(struct inode *inode)
 int
 xfs_inherit_acl(struct inode *inode, struct posix_acl *acl)
 {
-	mode_t mode = inode->i_mode;
+	umode_t mode = inode->i_mode;
 	int error = 0, inherit = 0;
 
 	if (S_ISDIR(inode->i_mode)) {
@@ -381,7 +381,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 		goto out_release;
 
 	if (type == ACL_TYPE_ACCESS) {
-		mode_t mode = inode->i_mode;
+		umode_t mode = inode->i_mode;
 		error = posix_acl_equiv_mode(acl, &mode);
 
 		if (error <= 0) {