aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorTony Lindgren <tony@atomide.com>2011-06-13 10:40:25 -0400
committerTony Lindgren <tony@atomide.com>2011-06-13 10:40:25 -0400
commitc8e0bf95fc01d6e2ca585fe08010800b6c56e823 (patch)
treef901bdcb5b20e93261cf9cf324ebbcf3fd24ce58 /fs
parent9d5ae7cd6cb9ead43336fec1094184d1dc740fbd (diff)
parent345f79b3de7f6d651e4dba794af7c7303bdfd649 (diff)
Merge branch 'for_3.0/pm-fixes' of ssh://master.kernel.org/pub/scm/linux/kernel/git/khilman/linux-omap-pm into fixes
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/affs/namei.c5
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/attr.c7
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/bfs/dir.c3
-rw-r--r--fs/bio.c16
-rw-r--r--fs/block_dev.c4
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c28
-rw-r--r--fs/btrfs/ctree.h24
-rw-r--r--fs/btrfs/delayed-inode.c8
-rw-r--r--fs/btrfs/disk-io.c36
-rw-r--r--fs/btrfs/extent-tree.c103
-rw-r--r--fs/btrfs/extent_io.c2
-rw-r--r--fs/btrfs/file.c10
-rw-r--r--fs/btrfs/free-space-cache.c70
-rw-r--r--fs/btrfs/inode-map.c34
-rw-r--r--fs/btrfs/inode.c263
-rw-r--r--fs/btrfs/ioctl.c26
-rw-r--r--fs/btrfs/relocation.c34
-rw-r--r--fs/btrfs/scrub.c123
-rw-r--r--fs/btrfs/super.c8
-rw-r--r--fs/btrfs/transaction.c302
-rw-r--r--fs/btrfs/transaction.h29
-rw-r--r--fs/btrfs/volumes.c2
-rw-r--r--fs/btrfs/xattr.c2
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/cifs/cifsacl.c3
-rw-r--r--fs/coda/dir.c5
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/ecryptfs/crypto.c74
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h26
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/ecryptfs/inode.c286
-rw-r--r--fs/ecryptfs/main.c84
-rw-r--r--fs/ecryptfs/super.c16
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/fat/namei_msdos.c5
-rw-r--r--fs/fat/namei_vfat.c5
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/fuse/dir.c5
-rw-r--r--fs/hfs/dir.c6
-rw-r--r--fs/hfsplus/dir.c8
-rw-r--r--fs/hostfs/hostfs_kern.c5
-rw-r--r--fs/hpfs/namei.c5
-rw-r--r--fs/inode.c54
-rw-r--r--fs/jffs2/dir.c5
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/namei.c5
-rw-r--r--fs/logfs/dir.c5
-rw-r--r--fs/minix/namei.c5
-rw-r--r--fs/namei.c44
-rw-r--r--fs/ncpfs/dir.c15
-rw-r--r--fs/nfs/Kconfig10
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.h17
-rw-r--r--fs/nfs/callback_proc.c51
-rw-r--r--fs/nfs/callback_xdr.c96
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.c14
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/inode.c11
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4filelayout.c38
-rw-r--r--fs/nfs/nfs4filelayout.h8
-rw-r--r--fs/nfs/nfs4filelayoutdev.c119
-rw-r--r--fs/nfs/nfs4proc.c107
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4xdr.c132
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c1057
-rw-r--r--fs/nfs/objlayout/objlayout.c712
-rw-r--r--fs/nfs/objlayout/objlayout.h187
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c412
-rw-r--r--fs/nfs/pagelist.c62
-rw-r--r--fs/nfs/pnfs.c342
-rw-r--r--fs/nfs/pnfs.h117
-rw-r--r--fs/nfs/pnfs_dev.c270
-rw-r--r--fs/nfs/read.c9
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/write.c10
-rw-r--r--fs/nfsd/export.c6
-rw-r--r--fs/nfsd/nfs3proc.c2
-rw-r--r--fs/nfsd/nfs3xdr.c2
-rw-r--r--fs/nfsd/nfs4proc.c73
-rw-r--r--fs/nfsd/nfs4state.c42
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/vfs.c33
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/omfs/dir.c11
-rw-r--r--fs/partitions/check.c10
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/reiserfs/namei.c5
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/squashfs/export.c2
-rw-r--r--fs/squashfs/fragment.c2
-rw-r--r--fs/squashfs/id.c2
-rw-r--r--fs/squashfs/super.c6
-rw-r--r--fs/sysv/namei.c5
-rw-r--r--fs/ubifs/dir.c5
-rw-r--r--fs/ubifs/io.c2
-rw-r--r--fs/ubifs/journal.c1
-rw-r--r--fs/ubifs/orphan.c2
-rw-r--r--fs/ubifs/recovery.c164
-rw-r--r--fs/ubifs/replay.c3
-rw-r--r--fs/ubifs/shrinker.c9
-rw-r--r--fs/ubifs/super.c44
-rw-r--r--fs/ubifs/tnc.c9
-rw-r--r--fs/ubifs/ubifs.h6
-rw-r--r--fs/udf/namei.c5
-rw-r--r--fs/ufs/namei.c5
-rw-r--r--fs/xattr.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
125 files changed, 4779 insertions, 1421 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d7f3e69ae29..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
814 814
815int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) 815int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
816{ 816{
817 dentry_unhash(d);
818 return v9fs_remove(i, d, 1); 817 return v9fs_remove(i, d, 1);
819} 818}
820 819
@@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
840 struct p9_fid *newdirfid; 839 struct p9_fid *newdirfid;
841 struct p9_wstat wstat; 840 struct p9_wstat wstat;
842 841
843 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
844 dentry_unhash(new_dentry);
845
846 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 842 P9_DPRINTK(P9_DEBUG_VFS, "\n");
847 retval = 0; 843 retval = 0;
848 old_inode = old_dentry->d_inode; 844 old_inode = old_dentry->d_inode;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 03330e2e390c..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
320 dentry->d_inode->i_ino, 320 dentry->d_inode->i_ino,
321 (int)dentry->d_name.len, dentry->d_name.name); 321 (int)dentry->d_name.len, dentry->d_name.name);
322 322
323 dentry_unhash(dentry);
324
325 return affs_remove_header(dentry); 323 return affs_remove_header(dentry);
326} 324}
327 325
@@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
419 struct buffer_head *bh = NULL; 417 struct buffer_head *bh = NULL;
420 int retval; 418 int retval;
421 419
422 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
423 dentry_unhash(new_dentry);
424
425 pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", 420 pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
426 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, 421 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
427 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); 422 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2c4e05160042..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
845 _enter("{%x:%u},{%s}", 845 _enter("{%x:%u},{%s}",
846 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); 846 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
847 847
848 dentry_unhash(dentry);
849
850 ret = -ENAMETOOLONG; 848 ret = -ENAMETOOLONG;
851 if (dentry->d_name.len >= AFSNAMEMAX) 849 if (dentry->d_name.len >= AFSNAMEMAX)
852 goto error; 850 goto error;
@@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
1148 struct key *key; 1146 struct key *key;
1149 int ret; 1147 int ret;
1150 1148
1151 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1152 dentry_unhash(new_dentry);
1153
1154 vnode = AFS_FS_I(old_dentry->d_inode); 1149 vnode = AFS_FS_I(old_dentry->d_inode);
1155 orig_dvnode = AFS_FS_I(old_dir); 1150 orig_dvnode = AFS_FS_I(old_dir);
1156 new_dvnode = AFS_FS_I(new_dir); 1151 new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/attr.c b/fs/attr.c
index 91dbe2a107f2..caf2aa521e2b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
175 return -EPERM; 175 return -EPERM;
176 } 176 }
177 177
178 if ((ia_valid & ATTR_MODE)) {
179 mode_t amode = attr->ia_mode;
180 /* Flag setting protected by i_mutex */
181 if (is_sxid(amode))
182 inode->i_flags &= ~S_NOSEC;
183 }
184
178 now = current_fs_time(inode->i_sb); 185 now = current_fs_time(inode->i_sb);
179 186
180 attr->ia_ctime = now; 187 attr->ia_ctime = now;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 87d95a8cddbc..f55ae23b137e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
583 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 583 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
584 return -EACCES; 584 return -EACCES;
585 585
586 dentry_unhash(dentry);
587
588 if (atomic_dec_and_test(&ino->count)) { 586 if (atomic_dec_and_test(&ino->count)) {
589 p_ino = autofs4_dentry_ino(dentry->d_parent); 587 p_ino = autofs4_dentry_ino(dentry->d_parent);
590 if (p_ino && dentry->d_parent != dentry) 588 if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index c7d1d06b0483..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
224 struct bfs_sb_info *info; 224 struct bfs_sb_info *info;
225 int error = -ENOENT; 225 int error = -ENOENT;
226 226
227 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
228 dentry_unhash(new_dentry);
229
230 old_bh = new_bh = NULL; 227 old_bh = new_bh = NULL;
231 old_inode = old_dentry->d_inode; 228 old_inode = old_dentry->d_inode;
232 if (S_ISDIR(old_inode->i_mode)) 229 if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/bio.c b/fs/bio.c
index 840a0d755248..9bfade8a609b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
638 * @offset: vec entry offset 638 * @offset: vec entry offset
639 * 639 *
640 * Attempt to add a page to the bio_vec maplist. This can fail for a 640 * Attempt to add a page to the bio_vec maplist. This can fail for a
641 * number of reasons, such as the bio being full or target block 641 * number of reasons, such as the bio being full or target block device
642 * device limitations. The target block device must allow bio's 642 * limitations. The target block device must allow bio's up to PAGE_SIZE,
643 * smaller than PAGE_SIZE, so it is always possible to add a single 643 * so it is always possible to add a single page to an empty bio.
644 * page to an empty bio. This should only be used by REQ_PC bios. 644 *
645 * This should only be used by REQ_PC bios.
645 */ 646 */
646int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, 647int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
647 unsigned int len, unsigned int offset) 648 unsigned int len, unsigned int offset)
@@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page);
659 * @offset: vec entry offset 660 * @offset: vec entry offset
660 * 661 *
661 * Attempt to add a page to the bio_vec maplist. This can fail for a 662 * Attempt to add a page to the bio_vec maplist. This can fail for a
662 * number of reasons, such as the bio being full or target block 663 * number of reasons, such as the bio being full or target block device
663 * device limitations. The target block device must allow bio's 664 * limitations. The target block device must allow bio's up to PAGE_SIZE,
664 * smaller than PAGE_SIZE, so it is always possible to add a single 665 * so it is always possible to add a single page to an empty bio.
665 * page to an empty bio.
666 */ 666 */
667int bio_add_page(struct bio *bio, struct page *page, unsigned int len, 667int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
668 unsigned int offset) 668 unsigned int offset)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1f2b19978333..1a2421f908f0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1272,8 +1272,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1272 * individual writeable reference is too fragile given the 1272 * individual writeable reference is too fragile given the
1273 * way @mode is used in blkdev_get/put(). 1273 * way @mode is used in blkdev_get/put().
1274 */ 1274 */
1275 if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) && 1275 if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder &&
1276 !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { 1276 (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
1277 bdev->bd_write_holder = true; 1277 bdev->bd_write_holder = true;
1278 disk_block_events(disk); 1278 disk_block_events(disk);
1279 } 1279 }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 93b1aa932014..52d7eca8c7bf 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -121,9 +121,6 @@ struct btrfs_inode {
121 */ 121 */
122 u64 index_cnt; 122 u64 index_cnt;
123 123
124 /* the start of block group preferred for allocations. */
125 u64 block_group;
126
127 /* the fsync log has some corner cases that mean we have to check 124 /* the fsync log has some corner cases that mean we have to check
128 * directories to see if any unlinks have been done before 125 * directories to see if any unlinks have been done before
129 * the directory was logged. See tree-log.c for all the 126 * the directory was logged. See tree-log.c for all the
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b0e18d986e0a..d84089349c82 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -43,8 +43,6 @@ struct btrfs_path *btrfs_alloc_path(void)
43{ 43{
44 struct btrfs_path *path; 44 struct btrfs_path *path;
45 path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); 45 path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
46 if (path)
47 path->reada = 1;
48 return path; 46 return path;
49} 47}
50 48
@@ -1224,6 +1222,7 @@ static void reada_for_search(struct btrfs_root *root,
1224 u64 search; 1222 u64 search;
1225 u64 target; 1223 u64 target;
1226 u64 nread = 0; 1224 u64 nread = 0;
1225 u64 gen;
1227 int direction = path->reada; 1226 int direction = path->reada;
1228 struct extent_buffer *eb; 1227 struct extent_buffer *eb;
1229 u32 nr; 1228 u32 nr;
@@ -1251,6 +1250,15 @@ static void reada_for_search(struct btrfs_root *root,
1251 nritems = btrfs_header_nritems(node); 1250 nritems = btrfs_header_nritems(node);
1252 nr = slot; 1251 nr = slot;
1253 while (1) { 1252 while (1) {
1253 if (!node->map_token) {
1254 unsigned long offset = btrfs_node_key_ptr_offset(nr);
1255 map_private_extent_buffer(node, offset,
1256 sizeof(struct btrfs_key_ptr),
1257 &node->map_token,
1258 &node->kaddr,
1259 &node->map_start,
1260 &node->map_len, KM_USER1);
1261 }
1254 if (direction < 0) { 1262 if (direction < 0) {
1255 if (nr == 0) 1263 if (nr == 0)
1256 break; 1264 break;
@@ -1268,14 +1276,23 @@ static void reada_for_search(struct btrfs_root *root,
1268 search = btrfs_node_blockptr(node, nr); 1276 search = btrfs_node_blockptr(node, nr);
1269 if ((search <= target && target - search <= 65536) || 1277 if ((search <= target && target - search <= 65536) ||
1270 (search > target && search - target <= 65536)) { 1278 (search > target && search - target <= 65536)) {
1271 readahead_tree_block(root, search, blocksize, 1279 gen = btrfs_node_ptr_generation(node, nr);
1272 btrfs_node_ptr_generation(node, nr)); 1280 if (node->map_token) {
1281 unmap_extent_buffer(node, node->map_token,
1282 KM_USER1);
1283 node->map_token = NULL;
1284 }
1285 readahead_tree_block(root, search, blocksize, gen);
1273 nread += blocksize; 1286 nread += blocksize;
1274 } 1287 }
1275 nscan++; 1288 nscan++;
1276 if ((nread > 65536 || nscan > 32)) 1289 if ((nread > 65536 || nscan > 32))
1277 break; 1290 break;
1278 } 1291 }
1292 if (node->map_token) {
1293 unmap_extent_buffer(node, node->map_token, KM_USER1);
1294 node->map_token = NULL;
1295 }
1279} 1296}
1280 1297
1281/* 1298/*
@@ -1648,9 +1665,6 @@ again:
1648 } 1665 }
1649cow_done: 1666cow_done:
1650 BUG_ON(!cow && ins_len); 1667 BUG_ON(!cow && ins_len);
1651 if (level != btrfs_header_level(b))
1652 WARN_ON(1);
1653 level = btrfs_header_level(b);
1654 1668
1655 p->nodes[level] = b; 1669 p->nodes[level] = b;
1656 if (!p->skip_locking) 1670 if (!p->skip_locking)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 332323e19dd1..378b5b4443f3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -930,7 +930,6 @@ struct btrfs_fs_info {
930 * is required instead of the faster short fsync log commits 930 * is required instead of the faster short fsync log commits
931 */ 931 */
932 u64 last_trans_log_full_commit; 932 u64 last_trans_log_full_commit;
933 u64 open_ioctl_trans;
934 unsigned long mount_opt:20; 933 unsigned long mount_opt:20;
935 unsigned long compress_type:4; 934 unsigned long compress_type:4;
936 u64 max_inline; 935 u64 max_inline;
@@ -947,7 +946,6 @@ struct btrfs_fs_info {
947 struct super_block *sb; 946 struct super_block *sb;
948 struct inode *btree_inode; 947 struct inode *btree_inode;
949 struct backing_dev_info bdi; 948 struct backing_dev_info bdi;
950 struct mutex trans_mutex;
951 struct mutex tree_log_mutex; 949 struct mutex tree_log_mutex;
952 struct mutex transaction_kthread_mutex; 950 struct mutex transaction_kthread_mutex;
953 struct mutex cleaner_mutex; 951 struct mutex cleaner_mutex;
@@ -968,6 +966,7 @@ struct btrfs_fs_info {
968 struct rw_semaphore subvol_sem; 966 struct rw_semaphore subvol_sem;
969 struct srcu_struct subvol_srcu; 967 struct srcu_struct subvol_srcu;
970 968
969 spinlock_t trans_lock;
971 struct list_head trans_list; 970 struct list_head trans_list;
972 struct list_head hashers; 971 struct list_head hashers;
973 struct list_head dead_roots; 972 struct list_head dead_roots;
@@ -980,6 +979,7 @@ struct btrfs_fs_info {
980 atomic_t async_submit_draining; 979 atomic_t async_submit_draining;
981 atomic_t nr_async_bios; 980 atomic_t nr_async_bios;
982 atomic_t async_delalloc_pages; 981 atomic_t async_delalloc_pages;
982 atomic_t open_ioctl_trans;
983 983
984 /* 984 /*
985 * this is used by the balancing code to wait for all the pending 985 * this is used by the balancing code to wait for all the pending
@@ -1044,6 +1044,7 @@ struct btrfs_fs_info {
1044 int closing; 1044 int closing;
1045 int log_root_recovering; 1045 int log_root_recovering;
1046 int enospc_unlink; 1046 int enospc_unlink;
1047 int trans_no_join;
1047 1048
1048 u64 total_pinned; 1049 u64 total_pinned;
1049 1050
@@ -1065,7 +1066,6 @@ struct btrfs_fs_info {
1065 struct reloc_control *reloc_ctl; 1066 struct reloc_control *reloc_ctl;
1066 1067
1067 spinlock_t delalloc_lock; 1068 spinlock_t delalloc_lock;
1068 spinlock_t new_trans_lock;
1069 u64 delalloc_bytes; 1069 u64 delalloc_bytes;
1070 1070
1071 /* data_alloc_cluster is only used in ssd mode */ 1071 /* data_alloc_cluster is only used in ssd mode */
@@ -1340,6 +1340,7 @@ struct btrfs_ioctl_defrag_range_args {
1340#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) 1340#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1341#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1341#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1342#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1342#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1343#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1343 1344
1344#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1345#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1345#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1346#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2238,6 +2239,9 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2238void btrfs_block_rsv_release(struct btrfs_root *root, 2239void btrfs_block_rsv_release(struct btrfs_root *root,
2239 struct btrfs_block_rsv *block_rsv, 2240 struct btrfs_block_rsv *block_rsv,
2240 u64 num_bytes); 2241 u64 num_bytes);
2242int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2243 struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2241int btrfs_set_block_group_ro(struct btrfs_root *root, 2245int btrfs_set_block_group_ro(struct btrfs_root *root,
2242 struct btrfs_block_group_cache *cache); 2246 struct btrfs_block_group_cache *cache);
2243int btrfs_set_block_group_rw(struct btrfs_root *root, 2247int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2350,6 +2354,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2350 struct btrfs_root *root, 2354 struct btrfs_root *root,
2351 struct extent_buffer *node, 2355 struct extent_buffer *node,
2352 struct extent_buffer *parent); 2356 struct extent_buffer *parent);
2357static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2358{
2359 /*
2360 * Get synced with close_ctree()
2361 */
2362 smp_mb();
2363 return fs_info->closing;
2364}
2365
2353/* root-item.c */ 2366/* root-item.c */
2354int btrfs_find_root_ref(struct btrfs_root *tree_root, 2367int btrfs_find_root_ref(struct btrfs_root *tree_root,
2355 struct btrfs_path *path, 2368 struct btrfs_path *path,
@@ -2512,8 +2525,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2512int btrfs_writepages(struct address_space *mapping, 2525int btrfs_writepages(struct address_space *mapping,
2513 struct writeback_control *wbc); 2526 struct writeback_control *wbc);
2514int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 2527int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2515 struct btrfs_root *new_root, 2528 struct btrfs_root *new_root, u64 new_dirid);
2516 u64 new_dirid, u64 alloc_hint);
2517int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 2529int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2518 size_t size, struct bio *bio, unsigned long bio_flags); 2530 size_t size, struct bio *bio, unsigned long bio_flags);
2519 2531
@@ -2524,7 +2536,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2524int btrfs_readpage(struct file *file, struct page *page); 2536int btrfs_readpage(struct file *file, struct page *page);
2525void btrfs_evict_inode(struct inode *inode); 2537void btrfs_evict_inode(struct inode *inode);
2526int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2538int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2527void btrfs_dirty_inode(struct inode *inode); 2539void btrfs_dirty_inode(struct inode *inode, int flags);
2528struct inode *btrfs_alloc_inode(struct super_block *sb); 2540struct inode *btrfs_alloc_inode(struct super_block *sb);
2529void btrfs_destroy_inode(struct inode *inode); 2541void btrfs_destroy_inode(struct inode *inode);
2530int btrfs_drop_inode(struct inode *inode); 2542int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 01e29503a54b..6462c29d2d37 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -678,6 +678,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
678 INIT_LIST_HEAD(&head); 678 INIT_LIST_HEAD(&head);
679 679
680 next = item; 680 next = item;
681 nitems = 0;
681 682
682 /* 683 /*
683 * count the number of the continuous items that we can insert in batch 684 * count the number of the continuous items that we can insert in batch
@@ -1129,7 +1130,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1129 delayed_node = async_node->delayed_node; 1130 delayed_node = async_node->delayed_node;
1130 root = delayed_node->root; 1131 root = delayed_node->root;
1131 1132
1132 trans = btrfs_join_transaction(root, 0); 1133 trans = btrfs_join_transaction(root);
1133 if (IS_ERR(trans)) 1134 if (IS_ERR(trans))
1134 goto free_path; 1135 goto free_path;
1135 1136
@@ -1572,8 +1573,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1572 btrfs_set_stack_inode_transid(inode_item, trans->transid); 1573 btrfs_set_stack_inode_transid(inode_item, trans->transid);
1573 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); 1574 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1574 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); 1575 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
1575 btrfs_set_stack_inode_block_group(inode_item, 1576 btrfs_set_stack_inode_block_group(inode_item, 0);
1576 BTRFS_I(inode)->block_group);
1577 1577
1578 btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), 1578 btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
1579 inode->i_atime.tv_sec); 1579 inode->i_atime.tv_sec);
@@ -1595,7 +1595,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1595 struct btrfs_root *root, struct inode *inode) 1595 struct btrfs_root *root, struct inode *inode)
1596{ 1596{
1597 struct btrfs_delayed_node *delayed_node; 1597 struct btrfs_delayed_node *delayed_node;
1598 int ret; 1598 int ret = 0;
1599 1599
1600 delayed_node = btrfs_get_or_create_delayed_node(inode); 1600 delayed_node = btrfs_get_or_create_delayed_node(inode);
1601 if (IS_ERR(delayed_node)) 1601 if (IS_ERR(delayed_node))
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 98b6a71decba..a203d363184d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1505,24 +1505,24 @@ static int transaction_kthread(void *arg)
1505 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1505 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1506 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1506 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1507 1507
1508 spin_lock(&root->fs_info->new_trans_lock); 1508 spin_lock(&root->fs_info->trans_lock);
1509 cur = root->fs_info->running_transaction; 1509 cur = root->fs_info->running_transaction;
1510 if (!cur) { 1510 if (!cur) {
1511 spin_unlock(&root->fs_info->new_trans_lock); 1511 spin_unlock(&root->fs_info->trans_lock);
1512 goto sleep; 1512 goto sleep;
1513 } 1513 }
1514 1514
1515 now = get_seconds(); 1515 now = get_seconds();
1516 if (!cur->blocked && 1516 if (!cur->blocked &&
1517 (now < cur->start_time || now - cur->start_time < 30)) { 1517 (now < cur->start_time || now - cur->start_time < 30)) {
1518 spin_unlock(&root->fs_info->new_trans_lock); 1518 spin_unlock(&root->fs_info->trans_lock);
1519 delay = HZ * 5; 1519 delay = HZ * 5;
1520 goto sleep; 1520 goto sleep;
1521 } 1521 }
1522 transid = cur->transid; 1522 transid = cur->transid;
1523 spin_unlock(&root->fs_info->new_trans_lock); 1523 spin_unlock(&root->fs_info->trans_lock);
1524 1524
1525 trans = btrfs_join_transaction(root, 1); 1525 trans = btrfs_join_transaction(root);
1526 BUG_ON(IS_ERR(trans)); 1526 BUG_ON(IS_ERR(trans));
1527 if (transid == trans->transid) { 1527 if (transid == trans->transid) {
1528 ret = btrfs_commit_transaction(trans, root); 1528 ret = btrfs_commit_transaction(trans, root);
@@ -1613,7 +1613,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1613 INIT_LIST_HEAD(&fs_info->ordered_operations); 1613 INIT_LIST_HEAD(&fs_info->ordered_operations);
1614 INIT_LIST_HEAD(&fs_info->caching_block_groups); 1614 INIT_LIST_HEAD(&fs_info->caching_block_groups);
1615 spin_lock_init(&fs_info->delalloc_lock); 1615 spin_lock_init(&fs_info->delalloc_lock);
1616 spin_lock_init(&fs_info->new_trans_lock); 1616 spin_lock_init(&fs_info->trans_lock);
1617 spin_lock_init(&fs_info->ref_cache_lock); 1617 spin_lock_init(&fs_info->ref_cache_lock);
1618 spin_lock_init(&fs_info->fs_roots_radix_lock); 1618 spin_lock_init(&fs_info->fs_roots_radix_lock);
1619 spin_lock_init(&fs_info->delayed_iput_lock); 1619 spin_lock_init(&fs_info->delayed_iput_lock);
@@ -1645,6 +1645,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1645 fs_info->max_inline = 8192 * 1024; 1645 fs_info->max_inline = 8192 * 1024;
1646 fs_info->metadata_ratio = 0; 1646 fs_info->metadata_ratio = 0;
1647 fs_info->defrag_inodes = RB_ROOT; 1647 fs_info->defrag_inodes = RB_ROOT;
1648 fs_info->trans_no_join = 0;
1648 1649
1649 fs_info->thread_pool_size = min_t(unsigned long, 1650 fs_info->thread_pool_size = min_t(unsigned long,
1650 num_online_cpus() + 2, 8); 1651 num_online_cpus() + 2, 8);
@@ -1709,7 +1710,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1709 fs_info->do_barriers = 1; 1710 fs_info->do_barriers = 1;
1710 1711
1711 1712
1712 mutex_init(&fs_info->trans_mutex);
1713 mutex_init(&fs_info->ordered_operations_mutex); 1713 mutex_init(&fs_info->ordered_operations_mutex);
1714 mutex_init(&fs_info->tree_log_mutex); 1714 mutex_init(&fs_info->tree_log_mutex);
1715 mutex_init(&fs_info->chunk_mutex); 1715 mutex_init(&fs_info->chunk_mutex);
@@ -2479,13 +2479,13 @@ int btrfs_commit_super(struct btrfs_root *root)
2479 down_write(&root->fs_info->cleanup_work_sem); 2479 down_write(&root->fs_info->cleanup_work_sem);
2480 up_write(&root->fs_info->cleanup_work_sem); 2480 up_write(&root->fs_info->cleanup_work_sem);
2481 2481
2482 trans = btrfs_join_transaction(root, 1); 2482 trans = btrfs_join_transaction(root);
2483 if (IS_ERR(trans)) 2483 if (IS_ERR(trans))
2484 return PTR_ERR(trans); 2484 return PTR_ERR(trans);
2485 ret = btrfs_commit_transaction(trans, root); 2485 ret = btrfs_commit_transaction(trans, root);
2486 BUG_ON(ret); 2486 BUG_ON(ret);
2487 /* run commit again to drop the original snapshot */ 2487 /* run commit again to drop the original snapshot */
2488 trans = btrfs_join_transaction(root, 1); 2488 trans = btrfs_join_transaction(root);
2489 if (IS_ERR(trans)) 2489 if (IS_ERR(trans))
2490 return PTR_ERR(trans); 2490 return PTR_ERR(trans);
2491 btrfs_commit_transaction(trans, root); 2491 btrfs_commit_transaction(trans, root);
@@ -3024,10 +3024,13 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3024 3024
3025 WARN_ON(1); 3025 WARN_ON(1);
3026 3026
3027 mutex_lock(&root->fs_info->trans_mutex);
3028 mutex_lock(&root->fs_info->transaction_kthread_mutex); 3027 mutex_lock(&root->fs_info->transaction_kthread_mutex);
3029 3028
3029 spin_lock(&root->fs_info->trans_lock);
3030 list_splice_init(&root->fs_info->trans_list, &list); 3030 list_splice_init(&root->fs_info->trans_list, &list);
3031 root->fs_info->trans_no_join = 1;
3032 spin_unlock(&root->fs_info->trans_lock);
3033
3031 while (!list_empty(&list)) { 3034 while (!list_empty(&list)) {
3032 t = list_entry(list.next, struct btrfs_transaction, list); 3035 t = list_entry(list.next, struct btrfs_transaction, list);
3033 if (!t) 3036 if (!t)
@@ -3052,23 +3055,18 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3052 t->blocked = 0; 3055 t->blocked = 0;
3053 if (waitqueue_active(&root->fs_info->transaction_wait)) 3056 if (waitqueue_active(&root->fs_info->transaction_wait))
3054 wake_up(&root->fs_info->transaction_wait); 3057 wake_up(&root->fs_info->transaction_wait);
3055 mutex_unlock(&root->fs_info->trans_mutex);
3056 3058
3057 mutex_lock(&root->fs_info->trans_mutex);
3058 t->commit_done = 1; 3059 t->commit_done = 1;
3059 if (waitqueue_active(&t->commit_wait)) 3060 if (waitqueue_active(&t->commit_wait))
3060 wake_up(&t->commit_wait); 3061 wake_up(&t->commit_wait);
3061 mutex_unlock(&root->fs_info->trans_mutex);
3062
3063 mutex_lock(&root->fs_info->trans_mutex);
3064 3062
3065 btrfs_destroy_pending_snapshots(t); 3063 btrfs_destroy_pending_snapshots(t);
3066 3064
3067 btrfs_destroy_delalloc_inodes(root); 3065 btrfs_destroy_delalloc_inodes(root);
3068 3066
3069 spin_lock(&root->fs_info->new_trans_lock); 3067 spin_lock(&root->fs_info->trans_lock);
3070 root->fs_info->running_transaction = NULL; 3068 root->fs_info->running_transaction = NULL;
3071 spin_unlock(&root->fs_info->new_trans_lock); 3069 spin_unlock(&root->fs_info->trans_lock);
3072 3070
3073 btrfs_destroy_marked_extents(root, &t->dirty_pages, 3071 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3074 EXTENT_DIRTY); 3072 EXTENT_DIRTY);
@@ -3082,8 +3080,10 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3082 kmem_cache_free(btrfs_transaction_cachep, t); 3080 kmem_cache_free(btrfs_transaction_cachep, t);
3083 } 3081 }
3084 3082
3083 spin_lock(&root->fs_info->trans_lock);
3084 root->fs_info->trans_no_join = 0;
3085 spin_unlock(&root->fs_info->trans_lock);
3085 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 3086 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3086 mutex_unlock(&root->fs_info->trans_mutex);
3087 3087
3088 return 0; 3088 return 0;
3089} 3089}
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 169bd62ce776..5b9b6b6df242 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -348,7 +348,7 @@ static int caching_kthread(void *data)
348 */ 348 */
349 path->skip_locking = 1; 349 path->skip_locking = 1;
350 path->search_commit_root = 1; 350 path->search_commit_root = 1;
351 path->reada = 2; 351 path->reada = 1;
352 352
353 key.objectid = last; 353 key.objectid = last;
354 key.offset = 0; 354 key.offset = 0;
@@ -366,8 +366,7 @@ again:
366 nritems = btrfs_header_nritems(leaf); 366 nritems = btrfs_header_nritems(leaf);
367 367
368 while (1) { 368 while (1) {
369 smp_mb(); 369 if (btrfs_fs_closing(fs_info) > 1) {
370 if (fs_info->closing > 1) {
371 last = (u64)-1; 370 last = (u64)-1;
372 break; 371 break;
373 } 372 }
@@ -379,15 +378,18 @@ again:
379 if (ret) 378 if (ret)
380 break; 379 break;
381 380
382 caching_ctl->progress = last; 381 if (need_resched() ||
383 btrfs_release_path(path); 382 btrfs_next_leaf(extent_root, path)) {
384 up_read(&fs_info->extent_commit_sem); 383 caching_ctl->progress = last;
385 mutex_unlock(&caching_ctl->mutex); 384 btrfs_release_path(path);
386 if (btrfs_transaction_in_commit(fs_info)) 385 up_read(&fs_info->extent_commit_sem);
387 schedule_timeout(1); 386 mutex_unlock(&caching_ctl->mutex);
388 else
389 cond_resched(); 387 cond_resched();
390 goto again; 388 goto again;
389 }
390 leaf = path->nodes[0];
391 nritems = btrfs_header_nritems(leaf);
392 continue;
391 } 393 }
392 394
393 if (key.objectid < block_group->key.objectid) { 395 if (key.objectid < block_group->key.objectid) {
@@ -3065,7 +3067,7 @@ again:
3065 spin_unlock(&data_sinfo->lock); 3067 spin_unlock(&data_sinfo->lock);
3066alloc: 3068alloc:
3067 alloc_target = btrfs_get_alloc_profile(root, 1); 3069 alloc_target = btrfs_get_alloc_profile(root, 1);
3068 trans = btrfs_join_transaction(root, 1); 3070 trans = btrfs_join_transaction(root);
3069 if (IS_ERR(trans)) 3071 if (IS_ERR(trans))
3070 return PTR_ERR(trans); 3072 return PTR_ERR(trans);
3071 3073
@@ -3091,9 +3093,10 @@ alloc:
3091 3093
3092 /* commit the current transaction and try again */ 3094 /* commit the current transaction and try again */
3093commit_trans: 3095commit_trans:
3094 if (!committed && !root->fs_info->open_ioctl_trans) { 3096 if (!committed &&
3097 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3095 committed = 1; 3098 committed = 1;
3096 trans = btrfs_join_transaction(root, 1); 3099 trans = btrfs_join_transaction(root);
3097 if (IS_ERR(trans)) 3100 if (IS_ERR(trans))
3098 return PTR_ERR(trans); 3101 return PTR_ERR(trans);
3099 ret = btrfs_commit_transaction(trans, root); 3102 ret = btrfs_commit_transaction(trans, root);
@@ -3472,7 +3475,7 @@ again:
3472 goto out; 3475 goto out;
3473 3476
3474 ret = -ENOSPC; 3477 ret = -ENOSPC;
3475 trans = btrfs_join_transaction(root, 1); 3478 trans = btrfs_join_transaction(root);
3476 if (IS_ERR(trans)) 3479 if (IS_ERR(trans))
3477 goto out; 3480 goto out;
3478 ret = btrfs_commit_transaction(trans, root); 3481 ret = btrfs_commit_transaction(trans, root);
@@ -3699,7 +3702,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3699 if (trans) 3702 if (trans)
3700 return -EAGAIN; 3703 return -EAGAIN;
3701 3704
3702 trans = btrfs_join_transaction(root, 1); 3705 trans = btrfs_join_transaction(root);
3703 BUG_ON(IS_ERR(trans)); 3706 BUG_ON(IS_ERR(trans));
3704 ret = btrfs_commit_transaction(trans, root); 3707 ret = btrfs_commit_transaction(trans, root);
3705 return 0; 3708 return 0;
@@ -3837,6 +3840,37 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3837 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 3840 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3838} 3841}
3839 3842
3843int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3844 struct btrfs_root *root,
3845 struct btrfs_block_rsv *rsv)
3846{
3847 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3848 u64 num_bytes;
3849 int ret;
3850
3851 /*
3852 * Truncate should be freeing data, but give us 2 items just in case it
3853 * needs to use some space. We may want to be smarter about this in the
3854 * future.
3855 */
3856 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3857
3858 /* We already have enough bytes, just return */
3859 if (rsv->reserved >= num_bytes)
3860 return 0;
3861
3862 num_bytes -= rsv->reserved;
3863
3864 /*
3865 * You should have reserved enough space before hand to do this, so this
3866 * should not fail.
3867 */
3868 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3869 BUG_ON(ret);
3870
3871 return 0;
3872}
3873
3840int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 3874int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3841 struct btrfs_root *root, 3875 struct btrfs_root *root,
3842 int num_items) 3876 int num_items)
@@ -3877,23 +3911,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3877 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 3911 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3878 3912
3879 /* 3913 /*
3880 * one for deleting orphan item, one for updating inode and 3914 * We need to hold space in order to delete our orphan item once we've
3881 * two for calling btrfs_truncate_inode_items. 3915 * added it, so this takes the reservation so we can release it later
3882 * 3916 * when we are truly done with the orphan item.
3883 * btrfs_truncate_inode_items is a delete operation, it frees
3884 * more space than it uses in most cases. So two units of
3885 * metadata space should be enough for calling it many times.
3886 * If all of the metadata space is used, we can commit
3887 * transaction and use space it freed.
3888 */ 3917 */
3889 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4); 3918 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3890 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3919 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3891} 3920}
3892 3921
3893void btrfs_orphan_release_metadata(struct inode *inode) 3922void btrfs_orphan_release_metadata(struct inode *inode)
3894{ 3923{
3895 struct btrfs_root *root = BTRFS_I(inode)->root; 3924 struct btrfs_root *root = BTRFS_I(inode)->root;
3896 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 4); 3925 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3897 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 3926 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3898} 3927}
3899 3928
@@ -4987,6 +5016,15 @@ have_block_group:
4987 if (unlikely(block_group->ro)) 5016 if (unlikely(block_group->ro))
4988 goto loop; 5017 goto loop;
4989 5018
5019 spin_lock(&block_group->free_space_ctl->tree_lock);
5020 if (cached &&
5021 block_group->free_space_ctl->free_space <
5022 num_bytes + empty_size) {
5023 spin_unlock(&block_group->free_space_ctl->tree_lock);
5024 goto loop;
5025 }
5026 spin_unlock(&block_group->free_space_ctl->tree_lock);
5027
4990 /* 5028 /*
4991 * Ok we want to try and use the cluster allocator, so lets look 5029 * Ok we want to try and use the cluster allocator, so lets look
4992 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5030 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
@@ -5150,6 +5188,7 @@ checks:
5150 btrfs_add_free_space(block_group, offset, 5188 btrfs_add_free_space(block_group, offset,
5151 search_start - offset); 5189 search_start - offset);
5152 BUG_ON(offset > search_start); 5190 BUG_ON(offset > search_start);
5191 btrfs_put_block_group(block_group);
5153 break; 5192 break;
5154loop: 5193loop:
5155 failed_cluster_refill = false; 5194 failed_cluster_refill = false;
@@ -5242,14 +5281,7 @@ loop:
5242 ret = -ENOSPC; 5281 ret = -ENOSPC;
5243 } else if (!ins->objectid) { 5282 } else if (!ins->objectid) {
5244 ret = -ENOSPC; 5283 ret = -ENOSPC;
5245 } 5284 } else if (ins->objectid) {
5246
5247 /* we found what we needed */
5248 if (ins->objectid) {
5249 if (!(data & BTRFS_BLOCK_GROUP_DATA))
5250 trans->block_group = block_group->key.objectid;
5251
5252 btrfs_put_block_group(block_group);
5253 ret = 0; 5285 ret = 0;
5254 } 5286 }
5255 5287
@@ -6526,7 +6558,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6526 6558
6527 BUG_ON(cache->ro); 6559 BUG_ON(cache->ro);
6528 6560
6529 trans = btrfs_join_transaction(root, 1); 6561 trans = btrfs_join_transaction(root);
6530 BUG_ON(IS_ERR(trans)); 6562 BUG_ON(IS_ERR(trans));
6531 6563
6532 alloc_flags = update_block_group_flags(root, cache->flags); 6564 alloc_flags = update_block_group_flags(root, cache->flags);
@@ -6882,6 +6914,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
6882 path = btrfs_alloc_path(); 6914 path = btrfs_alloc_path();
6883 if (!path) 6915 if (!path)
6884 return -ENOMEM; 6916 return -ENOMEM;
6917 path->reada = 1;
6885 6918
6886 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 6919 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
6887 if (cache_gen != 0 && 6920 if (cache_gen != 0 &&
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c5d9fbb92bc3..7055d11c1efd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1476,7 +1476,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
1476 if (total_bytes >= max_bytes) 1476 if (total_bytes >= max_bytes)
1477 break; 1477 break;
1478 if (!found) { 1478 if (!found) {
1479 *start = state->start; 1479 *start = max(cur_start, state->start);
1480 found = 1; 1480 found = 1;
1481 } 1481 }
1482 last = state->end; 1482 last = state->end;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c6a22d783c35..fa4ef18b66b1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -129,7 +129,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
129 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 129 if (!btrfs_test_opt(root, AUTO_DEFRAG))
130 return 0; 130 return 0;
131 131
132 if (root->fs_info->closing) 132 if (btrfs_fs_closing(root->fs_info))
133 return 0; 133 return 0;
134 134
135 if (BTRFS_I(inode)->in_defrag) 135 if (BTRFS_I(inode)->in_defrag)
@@ -144,7 +144,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
144 if (!defrag) 144 if (!defrag)
145 return -ENOMEM; 145 return -ENOMEM;
146 146
147 defrag->ino = inode->i_ino; 147 defrag->ino = btrfs_ino(inode);
148 defrag->transid = transid; 148 defrag->transid = transid;
149 defrag->root = root->root_key.objectid; 149 defrag->root = root->root_key.objectid;
150 150
@@ -229,7 +229,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
229 first_ino = defrag->ino + 1; 229 first_ino = defrag->ino + 1;
230 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes); 230 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
231 231
232 if (fs_info->closing) 232 if (btrfs_fs_closing(fs_info))
233 goto next_free; 233 goto next_free;
234 234
235 spin_unlock(&fs_info->defrag_inodes_lock); 235 spin_unlock(&fs_info->defrag_inodes_lock);
@@ -1480,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync)
1480 * the current transaction, we can bail out now without any 1480 * the current transaction, we can bail out now without any
1481 * syncing 1481 * syncing
1482 */ 1482 */
1483 mutex_lock(&root->fs_info->trans_mutex); 1483 smp_mb();
1484 if (BTRFS_I(inode)->last_trans <= 1484 if (BTRFS_I(inode)->last_trans <=
1485 root->fs_info->last_trans_committed) { 1485 root->fs_info->last_trans_committed) {
1486 BTRFS_I(inode)->last_trans = 0; 1486 BTRFS_I(inode)->last_trans = 0;
1487 mutex_unlock(&root->fs_info->trans_mutex);
1488 goto out; 1487 goto out;
1489 } 1488 }
1490 mutex_unlock(&root->fs_info->trans_mutex);
1491 1489
1492 /* 1490 /*
1493 * ok we haven't committed the transaction yet, lets do a commit 1491 * ok we haven't committed the transaction yet, lets do a commit
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 70d45795d758..ad144736a5fd 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,7 +98,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 98 return inode;
99 99
100 spin_lock(&block_group->lock); 100 spin_lock(&block_group->lock);
101 if (!root->fs_info->closing) { 101 if (!btrfs_fs_closing(root->fs_info)) {
102 block_group->inode = igrab(inode); 102 block_group->inode = igrab(inode);
103 block_group->iref = 1; 103 block_group->iref = 1;
104 } 104 }
@@ -402,7 +402,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
402 spin_lock(&ctl->tree_lock); 402 spin_lock(&ctl->tree_lock);
403 ret = link_free_space(ctl, e); 403 ret = link_free_space(ctl, e);
404 spin_unlock(&ctl->tree_lock); 404 spin_unlock(&ctl->tree_lock);
405 BUG_ON(ret); 405 if (ret) {
406 printk(KERN_ERR "Duplicate entries in "
407 "free space cache, dumping\n");
408 kunmap(page);
409 unlock_page(page);
410 page_cache_release(page);
411 goto free_cache;
412 }
406 } else { 413 } else {
407 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 414 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
408 if (!e->bitmap) { 415 if (!e->bitmap) {
@@ -419,6 +426,14 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
419 ctl->op->recalc_thresholds(ctl); 426 ctl->op->recalc_thresholds(ctl);
420 spin_unlock(&ctl->tree_lock); 427 spin_unlock(&ctl->tree_lock);
421 list_add_tail(&e->list, &bitmaps); 428 list_add_tail(&e->list, &bitmaps);
429 if (ret) {
430 printk(KERN_ERR "Duplicate entries in "
431 "free space cache, dumping\n");
432 kunmap(page);
433 unlock_page(page);
434 page_cache_release(page);
435 goto free_cache;
436 }
422 } 437 }
423 438
424 num_entries--; 439 num_entries--;
@@ -478,8 +493,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
478 * If we're unmounting then just return, since this does a search on the 493 * If we're unmounting then just return, since this does a search on the
479 * normal root and not the commit root and we could deadlock. 494 * normal root and not the commit root and we could deadlock.
480 */ 495 */
481 smp_mb(); 496 if (btrfs_fs_closing(fs_info))
482 if (fs_info->closing)
483 return 0; 497 return 0;
484 498
485 /* 499 /*
@@ -575,10 +589,25 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
575 589
576 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 590 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
577 PAGE_CACHE_SHIFT; 591 PAGE_CACHE_SHIFT;
592
593 /* Since the first page has all of our checksums and our generation we
594 * need to calculate the offset into the page that we can start writing
595 * our entries.
596 */
597 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
598
578 filemap_write_and_wait(inode->i_mapping); 599 filemap_write_and_wait(inode->i_mapping);
579 btrfs_wait_ordered_range(inode, inode->i_size & 600 btrfs_wait_ordered_range(inode, inode->i_size &
580 ~(root->sectorsize - 1), (u64)-1); 601 ~(root->sectorsize - 1), (u64)-1);
581 602
603 /* make sure we don't overflow that first page */
604 if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
605 /* this is really the same as running out of space, where we also return 0 */
606 printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
607 ret = 0;
608 goto out_update;
609 }
610
582 /* We need a checksum per page. */ 611 /* We need a checksum per page. */
583 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS); 612 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
584 if (!crc) 613 if (!crc)
@@ -590,12 +619,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
590 return -1; 619 return -1;
591 } 620 }
592 621
593 /* Since the first page has all of our checksums and our generation we
594 * need to calculate the offset into the page that we can start writing
595 * our entries.
596 */
597 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
598
599 /* Get the cluster for this block_group if it exists */ 622 /* Get the cluster for this block_group if it exists */
600 if (block_group && !list_empty(&block_group->cluster_list)) 623 if (block_group && !list_empty(&block_group->cluster_list))
601 cluster = list_entry(block_group->cluster_list.next, 624 cluster = list_entry(block_group->cluster_list.next,
@@ -857,12 +880,14 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
857 ret = 1; 880 ret = 1;
858 881
859out_free: 882out_free:
883 kfree(checksums);
884 kfree(pages);
885
886out_update:
860 if (ret != 1) { 887 if (ret != 1) {
861 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 888 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
862 BTRFS_I(inode)->generation = 0; 889 BTRFS_I(inode)->generation = 0;
863 } 890 }
864 kfree(checksums);
865 kfree(pages);
866 btrfs_update_inode(trans, root, inode); 891 btrfs_update_inode(trans, root, inode);
867 return ret; 892 return ret;
868} 893}
@@ -963,10 +988,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
963 * logically. 988 * logically.
964 */ 989 */
965 if (bitmap) { 990 if (bitmap) {
966 WARN_ON(info->bitmap); 991 if (info->bitmap) {
992 WARN_ON_ONCE(1);
993 return -EEXIST;
994 }
967 p = &(*p)->rb_right; 995 p = &(*p)->rb_right;
968 } else { 996 } else {
969 WARN_ON(!info->bitmap); 997 if (!info->bitmap) {
998 WARN_ON_ONCE(1);
999 return -EEXIST;
1000 }
970 p = &(*p)->rb_left; 1001 p = &(*p)->rb_left;
971 } 1002 }
972 } 1003 }
@@ -2481,7 +2512,7 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
2481 return inode; 2512 return inode;
2482 2513
2483 spin_lock(&root->cache_lock); 2514 spin_lock(&root->cache_lock);
2484 if (!root->fs_info->closing) 2515 if (!btrfs_fs_closing(root->fs_info))
2485 root->cache_inode = igrab(inode); 2516 root->cache_inode = igrab(inode);
2486 spin_unlock(&root->cache_lock); 2517 spin_unlock(&root->cache_lock);
2487 2518
@@ -2504,12 +2535,14 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2504 int ret = 0; 2535 int ret = 0;
2505 u64 root_gen = btrfs_root_generation(&root->root_item); 2536 u64 root_gen = btrfs_root_generation(&root->root_item);
2506 2537
2538 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2539 return 0;
2540
2507 /* 2541 /*
2508 * If we're unmounting then just return, since this does a search on the 2542 * If we're unmounting then just return, since this does a search on the
2509 * normal root and not the commit root and we could deadlock. 2543 * normal root and not the commit root and we could deadlock.
2510 */ 2544 */
2511 smp_mb(); 2545 if (btrfs_fs_closing(fs_info))
2512 if (fs_info->closing)
2513 return 0; 2546 return 0;
2514 2547
2515 path = btrfs_alloc_path(); 2548 path = btrfs_alloc_path();
@@ -2543,6 +2576,9 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2543 struct inode *inode; 2576 struct inode *inode;
2544 int ret; 2577 int ret;
2545 2578
2579 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2580 return 0;
2581
2546 inode = lookup_free_ino_inode(root, path); 2582 inode = lookup_free_ino_inode(root, path);
2547 if (IS_ERR(inode)) 2583 if (IS_ERR(inode))
2548 return 0; 2584 return 0;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 3262cd17a12f..b4087e0fa871 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,6 +38,9 @@ static int caching_kthread(void *data)
38 int slot; 38 int slot;
39 int ret; 39 int ret;
40 40
41 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
42 return 0;
43
41 path = btrfs_alloc_path(); 44 path = btrfs_alloc_path();
42 if (!path) 45 if (!path)
43 return -ENOMEM; 46 return -ENOMEM;
@@ -59,8 +62,7 @@ again:
59 goto out; 62 goto out;
60 63
61 while (1) { 64 while (1) {
62 smp_mb(); 65 if (btrfs_fs_closing(fs_info))
63 if (fs_info->closing)
64 goto out; 66 goto out;
65 67
66 leaf = path->nodes[0]; 68 leaf = path->nodes[0];
@@ -141,6 +143,9 @@ static void start_caching(struct btrfs_root *root)
141 int ret; 143 int ret;
142 u64 objectid; 144 u64 objectid;
143 145
146 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
147 return;
148
144 spin_lock(&root->cache_lock); 149 spin_lock(&root->cache_lock);
145 if (root->cached != BTRFS_CACHE_NO) { 150 if (root->cached != BTRFS_CACHE_NO) {
146 spin_unlock(&root->cache_lock); 151 spin_unlock(&root->cache_lock);
@@ -178,6 +183,9 @@ static void start_caching(struct btrfs_root *root)
178 183
179int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) 184int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
180{ 185{
186 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
187 return btrfs_find_free_objectid(root, objectid);
188
181again: 189again:
182 *objectid = btrfs_find_ino_for_alloc(root); 190 *objectid = btrfs_find_ino_for_alloc(root);
183 191
@@ -201,6 +209,10 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
201{ 209{
202 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 210 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
203 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; 211 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
212
213 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
214 return;
215
204again: 216again:
205 if (root->cached == BTRFS_CACHE_FINISHED) { 217 if (root->cached == BTRFS_CACHE_FINISHED) {
206 __btrfs_add_free_space(ctl, objectid, 1); 218 __btrfs_add_free_space(ctl, objectid, 1);
@@ -250,6 +262,9 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
250 struct rb_node *n; 262 struct rb_node *n;
251 u64 count; 263 u64 count;
252 264
265 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
266 return;
267
253 while (1) { 268 while (1) {
254 n = rb_first(rbroot); 269 n = rb_first(rbroot);
255 if (!n) 270 if (!n)
@@ -388,9 +403,24 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
388 int prealloc; 403 int prealloc;
389 bool retry = false; 404 bool retry = false;
390 405
406 /* only fs tree and subvol/snap needs ino cache */
407 if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
408 (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
409 root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
410 return 0;
411
412 /* Don't save inode cache if we are deleting this root */
413 if (btrfs_root_refs(&root->root_item) == 0 &&
414 root != root->fs_info->tree_root)
415 return 0;
416
417 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
418 return 0;
419
391 path = btrfs_alloc_path(); 420 path = btrfs_alloc_path();
392 if (!path) 421 if (!path)
393 return -ENOMEM; 422 return -ENOMEM;
423
394again: 424again:
395 inode = lookup_free_ino_inode(root, path); 425 inode = lookup_free_ino_inode(root, path);
396 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb51bb1fa44f..ebf95f7a44d6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -138,7 +138,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
138 return -ENOMEM; 138 return -ENOMEM;
139 139
140 path->leave_spinning = 1; 140 path->leave_spinning = 1;
141 btrfs_set_trans_block_group(trans, inode);
142 141
143 key.objectid = btrfs_ino(inode); 142 key.objectid = btrfs_ino(inode);
144 key.offset = start; 143 key.offset = start;
@@ -426,9 +425,8 @@ again:
426 } 425 }
427 } 426 }
428 if (start == 0) { 427 if (start == 0) {
429 trans = btrfs_join_transaction(root, 1); 428 trans = btrfs_join_transaction(root);
430 BUG_ON(IS_ERR(trans)); 429 BUG_ON(IS_ERR(trans));
431 btrfs_set_trans_block_group(trans, inode);
432 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 430 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
433 431
434 /* lets try to make an inline extent */ 432 /* lets try to make an inline extent */
@@ -623,8 +621,9 @@ retry:
623 async_extent->start + async_extent->ram_size - 1, 621 async_extent->start + async_extent->ram_size - 1,
624 GFP_NOFS); 622 GFP_NOFS);
625 623
626 trans = btrfs_join_transaction(root, 1); 624 trans = btrfs_join_transaction(root);
627 BUG_ON(IS_ERR(trans)); 625 BUG_ON(IS_ERR(trans));
626 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
628 ret = btrfs_reserve_extent(trans, root, 627 ret = btrfs_reserve_extent(trans, root,
629 async_extent->compressed_size, 628 async_extent->compressed_size,
630 async_extent->compressed_size, 629 async_extent->compressed_size,
@@ -793,9 +792,8 @@ static noinline int cow_file_range(struct inode *inode,
793 int ret = 0; 792 int ret = 0;
794 793
795 BUG_ON(is_free_space_inode(root, inode)); 794 BUG_ON(is_free_space_inode(root, inode));
796 trans = btrfs_join_transaction(root, 1); 795 trans = btrfs_join_transaction(root);
797 BUG_ON(IS_ERR(trans)); 796 BUG_ON(IS_ERR(trans));
798 btrfs_set_trans_block_group(trans, inode);
799 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 797 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
800 798
801 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 799 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
@@ -1077,10 +1075,12 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1077 nolock = is_free_space_inode(root, inode); 1075 nolock = is_free_space_inode(root, inode);
1078 1076
1079 if (nolock) 1077 if (nolock)
1080 trans = btrfs_join_transaction_nolock(root, 1); 1078 trans = btrfs_join_transaction_nolock(root);
1081 else 1079 else
1082 trans = btrfs_join_transaction(root, 1); 1080 trans = btrfs_join_transaction(root);
1081
1083 BUG_ON(IS_ERR(trans)); 1082 BUG_ON(IS_ERR(trans));
1083 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1084 1084
1085 cow_start = (u64)-1; 1085 cow_start = (u64)-1;
1086 cur_offset = start; 1086 cur_offset = start;
@@ -1519,8 +1519,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1519{ 1519{
1520 struct btrfs_ordered_sum *sum; 1520 struct btrfs_ordered_sum *sum;
1521 1521
1522 btrfs_set_trans_block_group(trans, inode);
1523
1524 list_for_each_entry(sum, list, list) { 1522 list_for_each_entry(sum, list, list) {
1525 btrfs_csum_file_blocks(trans, 1523 btrfs_csum_file_blocks(trans,
1526 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1524 BTRFS_I(inode)->root->fs_info->csum_root, sum);
@@ -1735,11 +1733,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1735 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1733 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1736 if (!ret) { 1734 if (!ret) {
1737 if (nolock) 1735 if (nolock)
1738 trans = btrfs_join_transaction_nolock(root, 1); 1736 trans = btrfs_join_transaction_nolock(root);
1739 else 1737 else
1740 trans = btrfs_join_transaction(root, 1); 1738 trans = btrfs_join_transaction(root);
1741 BUG_ON(IS_ERR(trans)); 1739 BUG_ON(IS_ERR(trans));
1742 btrfs_set_trans_block_group(trans, inode);
1743 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1740 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1744 ret = btrfs_update_inode(trans, root, inode); 1741 ret = btrfs_update_inode(trans, root, inode);
1745 BUG_ON(ret); 1742 BUG_ON(ret);
@@ -1752,11 +1749,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1752 0, &cached_state, GFP_NOFS); 1749 0, &cached_state, GFP_NOFS);
1753 1750
1754 if (nolock) 1751 if (nolock)
1755 trans = btrfs_join_transaction_nolock(root, 1); 1752 trans = btrfs_join_transaction_nolock(root);
1756 else 1753 else
1757 trans = btrfs_join_transaction(root, 1); 1754 trans = btrfs_join_transaction(root);
1758 BUG_ON(IS_ERR(trans)); 1755 BUG_ON(IS_ERR(trans));
1759 btrfs_set_trans_block_group(trans, inode);
1760 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1756 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1761 1757
1762 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1758 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2431,7 +2427,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2431 (u64)-1); 2427 (u64)-1);
2432 2428
2433 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2429 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2434 trans = btrfs_join_transaction(root, 1); 2430 trans = btrfs_join_transaction(root);
2435 if (!IS_ERR(trans)) 2431 if (!IS_ERR(trans))
2436 btrfs_end_transaction(trans, root); 2432 btrfs_end_transaction(trans, root);
2437 } 2433 }
@@ -2511,12 +2507,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
2511 struct btrfs_root *root = BTRFS_I(inode)->root; 2507 struct btrfs_root *root = BTRFS_I(inode)->root;
2512 struct btrfs_key location; 2508 struct btrfs_key location;
2513 int maybe_acls; 2509 int maybe_acls;
2514 u64 alloc_group_block;
2515 u32 rdev; 2510 u32 rdev;
2516 int ret; 2511 int ret;
2517 2512
2518 path = btrfs_alloc_path(); 2513 path = btrfs_alloc_path();
2519 BUG_ON(!path); 2514 BUG_ON(!path);
2515 path->leave_spinning = 1;
2520 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2516 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2521 2517
2522 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2518 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2526,6 +2522,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
2526 leaf = path->nodes[0]; 2522 leaf = path->nodes[0];
2527 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2523 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2528 struct btrfs_inode_item); 2524 struct btrfs_inode_item);
2525 if (!leaf->map_token)
2526 map_private_extent_buffer(leaf, (unsigned long)inode_item,
2527 sizeof(struct btrfs_inode_item),
2528 &leaf->map_token, &leaf->kaddr,
2529 &leaf->map_start, &leaf->map_len,
2530 KM_USER1);
2529 2531
2530 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2532 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2531 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2533 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
@@ -2555,8 +2557,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
2555 BTRFS_I(inode)->index_cnt = (u64)-1; 2557 BTRFS_I(inode)->index_cnt = (u64)-1;
2556 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2558 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2557 2559
2558 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2559
2560 /* 2560 /*
2561 * try to precache a NULL acl entry for files that don't have 2561 * try to precache a NULL acl entry for files that don't have
2562 * any xattrs or acls 2562 * any xattrs or acls
@@ -2566,8 +2566,11 @@ static void btrfs_read_locked_inode(struct inode *inode)
2566 if (!maybe_acls) 2566 if (!maybe_acls)
2567 cache_no_acl(inode); 2567 cache_no_acl(inode);
2568 2568
2569 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2569 if (leaf->map_token) {
2570 alloc_group_block, 0); 2570 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2571 leaf->map_token = NULL;
2572 }
2573
2571 btrfs_free_path(path); 2574 btrfs_free_path(path);
2572 inode_item = NULL; 2575 inode_item = NULL;
2573 2576
@@ -2647,7 +2650,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647 btrfs_set_inode_transid(leaf, item, trans->transid); 2650 btrfs_set_inode_transid(leaf, item, trans->transid);
2648 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2651 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2649 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2652 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2650 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2653 btrfs_set_inode_block_group(leaf, item, 0);
2651 2654
2652 if (leaf->map_token) { 2655 if (leaf->map_token) {
2653 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); 2656 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
@@ -3004,8 +3007,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3004 if (IS_ERR(trans)) 3007 if (IS_ERR(trans))
3005 return PTR_ERR(trans); 3008 return PTR_ERR(trans);
3006 3009
3007 btrfs_set_trans_block_group(trans, dir);
3008
3009 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3010 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3010 3011
3011 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3012 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
@@ -3094,8 +3095,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3094 if (IS_ERR(trans)) 3095 if (IS_ERR(trans))
3095 return PTR_ERR(trans); 3096 return PTR_ERR(trans);
3096 3097
3097 btrfs_set_trans_block_group(trans, dir);
3098
3099 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 3098 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3100 err = btrfs_unlink_subvol(trans, root, dir, 3099 err = btrfs_unlink_subvol(trans, root, dir,
3101 BTRFS_I(inode)->location.objectid, 3100 BTRFS_I(inode)->location.objectid,
@@ -3514,7 +3513,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3514 err = PTR_ERR(trans); 3513 err = PTR_ERR(trans);
3515 break; 3514 break;
3516 } 3515 }
3517 btrfs_set_trans_block_group(trans, inode);
3518 3516
3519 err = btrfs_drop_extents(trans, inode, cur_offset, 3517 err = btrfs_drop_extents(trans, inode, cur_offset,
3520 cur_offset + hole_size, 3518 cur_offset + hole_size,
@@ -3650,7 +3648,6 @@ void btrfs_evict_inode(struct inode *inode)
3650 while (1) { 3648 while (1) {
3651 trans = btrfs_start_transaction(root, 0); 3649 trans = btrfs_start_transaction(root, 0);
3652 BUG_ON(IS_ERR(trans)); 3650 BUG_ON(IS_ERR(trans));
3653 btrfs_set_trans_block_group(trans, inode);
3654 trans->block_rsv = root->orphan_block_rsv; 3651 trans->block_rsv = root->orphan_block_rsv;
3655 3652
3656 ret = btrfs_block_rsv_check(trans, root, 3653 ret = btrfs_block_rsv_check(trans, root,
@@ -4133,7 +4130,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4133 path = btrfs_alloc_path(); 4130 path = btrfs_alloc_path();
4134 if (!path) 4131 if (!path)
4135 return -ENOMEM; 4132 return -ENOMEM;
4136 path->reada = 2; 4133
4134 path->reada = 1;
4137 4135
4138 if (key_type == BTRFS_DIR_INDEX_KEY) { 4136 if (key_type == BTRFS_DIR_INDEX_KEY) {
4139 INIT_LIST_HEAD(&ins_list); 4137 INIT_LIST_HEAD(&ins_list);
@@ -4268,18 +4266,16 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4268 if (BTRFS_I(inode)->dummy_inode) 4266 if (BTRFS_I(inode)->dummy_inode)
4269 return 0; 4267 return 0;
4270 4268
4271 smp_mb(); 4269 if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
4272 if (root->fs_info->closing && is_free_space_inode(root, inode))
4273 nolock = true; 4270 nolock = true;
4274 4271
4275 if (wbc->sync_mode == WB_SYNC_ALL) { 4272 if (wbc->sync_mode == WB_SYNC_ALL) {
4276 if (nolock) 4273 if (nolock)
4277 trans = btrfs_join_transaction_nolock(root, 1); 4274 trans = btrfs_join_transaction_nolock(root);
4278 else 4275 else
4279 trans = btrfs_join_transaction(root, 1); 4276 trans = btrfs_join_transaction(root);
4280 if (IS_ERR(trans)) 4277 if (IS_ERR(trans))
4281 return PTR_ERR(trans); 4278 return PTR_ERR(trans);
4282 btrfs_set_trans_block_group(trans, inode);
4283 if (nolock) 4279 if (nolock)
4284 ret = btrfs_end_transaction_nolock(trans, root); 4280 ret = btrfs_end_transaction_nolock(trans, root);
4285 else 4281 else
@@ -4294,7 +4290,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4294 * FIXME, needs more benchmarking...there are no reasons other than performance 4290 * FIXME, needs more benchmarking...there are no reasons other than performance
4295 * to keep or drop this code. 4291 * to keep or drop this code.
4296 */ 4292 */
4297void btrfs_dirty_inode(struct inode *inode) 4293void btrfs_dirty_inode(struct inode *inode, int flags)
4298{ 4294{
4299 struct btrfs_root *root = BTRFS_I(inode)->root; 4295 struct btrfs_root *root = BTRFS_I(inode)->root;
4300 struct btrfs_trans_handle *trans; 4296 struct btrfs_trans_handle *trans;
@@ -4303,9 +4299,8 @@ void btrfs_dirty_inode(struct inode *inode)
4303 if (BTRFS_I(inode)->dummy_inode) 4299 if (BTRFS_I(inode)->dummy_inode)
4304 return; 4300 return;
4305 4301
4306 trans = btrfs_join_transaction(root, 1); 4302 trans = btrfs_join_transaction(root);
4307 BUG_ON(IS_ERR(trans)); 4303 BUG_ON(IS_ERR(trans));
4308 btrfs_set_trans_block_group(trans, inode);
4309 4304
4310 ret = btrfs_update_inode(trans, root, inode); 4305 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) { 4306 if (ret && ret == -ENOSPC) {
@@ -4319,7 +4314,6 @@ void btrfs_dirty_inode(struct inode *inode)
4319 PTR_ERR(trans)); 4314 PTR_ERR(trans));
4320 return; 4315 return;
4321 } 4316 }
4322 btrfs_set_trans_block_group(trans, inode);
4323 4317
4324 ret = btrfs_update_inode(trans, root, inode); 4318 ret = btrfs_update_inode(trans, root, inode);
4325 if (ret) { 4319 if (ret) {
@@ -4418,8 +4412,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4418 struct btrfs_root *root, 4412 struct btrfs_root *root,
4419 struct inode *dir, 4413 struct inode *dir,
4420 const char *name, int name_len, 4414 const char *name, int name_len,
4421 u64 ref_objectid, u64 objectid, 4415 u64 ref_objectid, u64 objectid, int mode,
4422 u64 alloc_hint, int mode, u64 *index) 4416 u64 *index)
4423{ 4417{
4424 struct inode *inode; 4418 struct inode *inode;
4425 struct btrfs_inode_item *inode_item; 4419 struct btrfs_inode_item *inode_item;
@@ -4472,8 +4466,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4472 owner = 0; 4466 owner = 0;
4473 else 4467 else
4474 owner = 1; 4468 owner = 1;
4475 BTRFS_I(inode)->block_group =
4476 btrfs_find_block_group(root, 0, alloc_hint, owner);
4477 4469
4478 key[0].objectid = objectid; 4470 key[0].objectid = objectid;
4479 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4471 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4629,15 +4621,13 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4629 if (IS_ERR(trans)) 4621 if (IS_ERR(trans))
4630 return PTR_ERR(trans); 4622 return PTR_ERR(trans);
4631 4623
4632 btrfs_set_trans_block_group(trans, dir);
4633
4634 err = btrfs_find_free_ino(root, &objectid); 4624 err = btrfs_find_free_ino(root, &objectid);
4635 if (err) 4625 if (err)
4636 goto out_unlock; 4626 goto out_unlock;
4637 4627
4638 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4628 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4639 dentry->d_name.len, btrfs_ino(dir), objectid, 4629 dentry->d_name.len, btrfs_ino(dir), objectid,
4640 BTRFS_I(dir)->block_group, mode, &index); 4630 mode, &index);
4641 if (IS_ERR(inode)) { 4631 if (IS_ERR(inode)) {
4642 err = PTR_ERR(inode); 4632 err = PTR_ERR(inode);
4643 goto out_unlock; 4633 goto out_unlock;
@@ -4649,7 +4639,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4649 goto out_unlock; 4639 goto out_unlock;
4650 } 4640 }
4651 4641
4652 btrfs_set_trans_block_group(trans, inode);
4653 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4642 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4654 if (err) 4643 if (err)
4655 drop_inode = 1; 4644 drop_inode = 1;
@@ -4658,8 +4647,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4658 init_special_inode(inode, inode->i_mode, rdev); 4647 init_special_inode(inode, inode->i_mode, rdev);
4659 btrfs_update_inode(trans, root, inode); 4648 btrfs_update_inode(trans, root, inode);
4660 } 4649 }
4661 btrfs_update_inode_block_group(trans, inode);
4662 btrfs_update_inode_block_group(trans, dir);
4663out_unlock: 4650out_unlock:
4664 nr = trans->blocks_used; 4651 nr = trans->blocks_used;
4665 btrfs_end_transaction_throttle(trans, root); 4652 btrfs_end_transaction_throttle(trans, root);
@@ -4692,15 +4679,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4692 if (IS_ERR(trans)) 4679 if (IS_ERR(trans))
4693 return PTR_ERR(trans); 4680 return PTR_ERR(trans);
4694 4681
4695 btrfs_set_trans_block_group(trans, dir);
4696
4697 err = btrfs_find_free_ino(root, &objectid); 4682 err = btrfs_find_free_ino(root, &objectid);
4698 if (err) 4683 if (err)
4699 goto out_unlock; 4684 goto out_unlock;
4700 4685
4701 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4686 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4702 dentry->d_name.len, btrfs_ino(dir), objectid, 4687 dentry->d_name.len, btrfs_ino(dir), objectid,
4703 BTRFS_I(dir)->block_group, mode, &index); 4688 mode, &index);
4704 if (IS_ERR(inode)) { 4689 if (IS_ERR(inode)) {
4705 err = PTR_ERR(inode); 4690 err = PTR_ERR(inode);
4706 goto out_unlock; 4691 goto out_unlock;
@@ -4712,7 +4697,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4712 goto out_unlock; 4697 goto out_unlock;
4713 } 4698 }
4714 4699
4715 btrfs_set_trans_block_group(trans, inode);
4716 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 4700 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4717 if (err) 4701 if (err)
4718 drop_inode = 1; 4702 drop_inode = 1;
@@ -4723,8 +4707,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4723 inode->i_op = &btrfs_file_inode_operations; 4707 inode->i_op = &btrfs_file_inode_operations;
4724 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4708 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4725 } 4709 }
4726 btrfs_update_inode_block_group(trans, inode);
4727 btrfs_update_inode_block_group(trans, dir);
4728out_unlock: 4710out_unlock:
4729 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4730 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
@@ -4771,8 +4753,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4771 4753
4772 btrfs_inc_nlink(inode); 4754 btrfs_inc_nlink(inode);
4773 inode->i_ctime = CURRENT_TIME; 4755 inode->i_ctime = CURRENT_TIME;
4774
4775 btrfs_set_trans_block_group(trans, dir);
4776 ihold(inode); 4756 ihold(inode);
4777 4757
4778 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index); 4758 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
@@ -4781,7 +4761,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4781 drop_inode = 1; 4761 drop_inode = 1;
4782 } else { 4762 } else {
4783 struct dentry *parent = dget_parent(dentry); 4763 struct dentry *parent = dget_parent(dentry);
4784 btrfs_update_inode_block_group(trans, dir);
4785 err = btrfs_update_inode(trans, root, inode); 4764 err = btrfs_update_inode(trans, root, inode);
4786 BUG_ON(err); 4765 BUG_ON(err);
4787 btrfs_log_new_name(trans, inode, NULL, parent); 4766 btrfs_log_new_name(trans, inode, NULL, parent);
@@ -4818,7 +4797,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4818 trans = btrfs_start_transaction(root, 5); 4797 trans = btrfs_start_transaction(root, 5);
4819 if (IS_ERR(trans)) 4798 if (IS_ERR(trans))
4820 return PTR_ERR(trans); 4799 return PTR_ERR(trans);
4821 btrfs_set_trans_block_group(trans, dir);
4822 4800
4823 err = btrfs_find_free_ino(root, &objectid); 4801 err = btrfs_find_free_ino(root, &objectid);
4824 if (err) 4802 if (err)
@@ -4826,8 +4804,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4826 4804
4827 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4805 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4828 dentry->d_name.len, btrfs_ino(dir), objectid, 4806 dentry->d_name.len, btrfs_ino(dir), objectid,
4829 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4807 S_IFDIR | mode, &index);
4830 &index);
4831 if (IS_ERR(inode)) { 4808 if (IS_ERR(inode)) {
4832 err = PTR_ERR(inode); 4809 err = PTR_ERR(inode);
4833 goto out_fail; 4810 goto out_fail;
@@ -4841,7 +4818,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4841 4818
4842 inode->i_op = &btrfs_dir_inode_operations; 4819 inode->i_op = &btrfs_dir_inode_operations;
4843 inode->i_fop = &btrfs_dir_file_operations; 4820 inode->i_fop = &btrfs_dir_file_operations;
4844 btrfs_set_trans_block_group(trans, inode);
4845 4821
4846 btrfs_i_size_write(inode, 0); 4822 btrfs_i_size_write(inode, 0);
4847 err = btrfs_update_inode(trans, root, inode); 4823 err = btrfs_update_inode(trans, root, inode);
@@ -4855,8 +4831,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4855 4831
4856 d_instantiate(dentry, inode); 4832 d_instantiate(dentry, inode);
4857 drop_on_err = 0; 4833 drop_on_err = 0;
4858 btrfs_update_inode_block_group(trans, inode);
4859 btrfs_update_inode_block_group(trans, dir);
4860 4834
4861out_fail: 4835out_fail:
4862 nr = trans->blocks_used; 4836 nr = trans->blocks_used;
@@ -4989,7 +4963,15 @@ again:
4989 4963
4990 if (!path) { 4964 if (!path) {
4991 path = btrfs_alloc_path(); 4965 path = btrfs_alloc_path();
4992 BUG_ON(!path); 4966 if (!path) {
4967 err = -ENOMEM;
4968 goto out;
4969 }
4970 /*
4971 * Chances are we'll be called again, so go ahead and do
4972 * readahead
4973 */
4974 path->reada = 1;
4993 } 4975 }
4994 4976
4995 ret = btrfs_lookup_file_extent(trans, root, path, 4977 ret = btrfs_lookup_file_extent(trans, root, path,
@@ -5130,8 +5112,10 @@ again:
5130 kunmap(page); 5112 kunmap(page);
5131 free_extent_map(em); 5113 free_extent_map(em);
5132 em = NULL; 5114 em = NULL;
5115
5133 btrfs_release_path(path); 5116 btrfs_release_path(path);
5134 trans = btrfs_join_transaction(root, 1); 5117 trans = btrfs_join_transaction(root);
5118
5135 if (IS_ERR(trans)) 5119 if (IS_ERR(trans))
5136 return ERR_CAST(trans); 5120 return ERR_CAST(trans);
5137 goto again; 5121 goto again;
@@ -5375,7 +5359,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5375 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5359 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5376 } 5360 }
5377 5361
5378 trans = btrfs_join_transaction(root, 0); 5362 trans = btrfs_join_transaction(root);
5379 if (IS_ERR(trans)) 5363 if (IS_ERR(trans))
5380 return ERR_CAST(trans); 5364 return ERR_CAST(trans);
5381 5365
@@ -5611,7 +5595,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5611 * to make sure the current transaction stays open 5595 * to make sure the current transaction stays open
5612 * while we look for nocow cross refs 5596 * while we look for nocow cross refs
5613 */ 5597 */
5614 trans = btrfs_join_transaction(root, 0); 5598 trans = btrfs_join_transaction(root);
5615 if (IS_ERR(trans)) 5599 if (IS_ERR(trans))
5616 goto must_cow; 5600 goto must_cow;
5617 5601
@@ -5750,7 +5734,7 @@ again:
5750 5734
5751 BUG_ON(!ordered); 5735 BUG_ON(!ordered);
5752 5736
5753 trans = btrfs_join_transaction(root, 1); 5737 trans = btrfs_join_transaction(root);
5754 if (IS_ERR(trans)) { 5738 if (IS_ERR(trans)) {
5755 err = -ENOMEM; 5739 err = -ENOMEM;
5756 goto out; 5740 goto out;
@@ -6500,6 +6484,7 @@ out:
6500static int btrfs_truncate(struct inode *inode) 6484static int btrfs_truncate(struct inode *inode)
6501{ 6485{
6502 struct btrfs_root *root = BTRFS_I(inode)->root; 6486 struct btrfs_root *root = BTRFS_I(inode)->root;
6487 struct btrfs_block_rsv *rsv;
6503 int ret; 6488 int ret;
6504 int err = 0; 6489 int err = 0;
6505 struct btrfs_trans_handle *trans; 6490 struct btrfs_trans_handle *trans;
@@ -6513,28 +6498,80 @@ static int btrfs_truncate(struct inode *inode)
6513 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6498 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6514 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6499 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6515 6500
6516 trans = btrfs_start_transaction(root, 5); 6501 /*
6517 if (IS_ERR(trans)) 6502 * Yes ladies and gentlemen, this is indeed ugly. The fact is we have
6518 return PTR_ERR(trans); 6503 * 3 things going on here
6504 *
6505 * 1) We need to reserve space for our orphan item and the space to
6506 * delete our orphan item. Lord knows we don't want to have a dangling
6507 * orphan item because we didn't reserve space to remove it.
6508 *
6509 * 2) We need to reserve space to update our inode.
6510 *
6511 * 3) We need to have something to cache all the space that is going to
6512 * be free'd up by the truncate operation, but also have some slack
6513 * space reserved in case it uses space during the truncate (thank you
6514 * very much snapshotting).
6515 *
6516 * And we need these to all be separate. The fact is we can use a lot of
6517 * space doing the truncate, and we have no earthly idea how much space
6518 * we will use, so we need the truncate reservation to be separate so it
6519 * doesn't end up using space reserved for updating the inode or
6520 * removing the orphan item. We also need to be able to stop the
6521 * transaction and start a new one, which means we need to be able to
6522 * update the inode several times, and we have no idea of knowing how
6523 * many times that will be, so we can't just reserve 1 item for the
6524 * entirety of the operation, so that has to be done separately as well.
6525 * Then there is the orphan item, which does indeed need to be held on
6526 * to for the whole operation, and we need nobody to touch this reserved
6527 * space except the orphan code.
6528 *
6529 * So that leaves us with
6530 *
6531 * 1) root->orphan_block_rsv - for the orphan deletion.
6532 * 2) rsv - for the truncate reservation, which we will steal from the
6533 * transaction reservation.
6534 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
6535 * updating the inode.
6536 */
6537 rsv = btrfs_alloc_block_rsv(root);
6538 if (!rsv)
6539 return -ENOMEM;
6540 btrfs_add_durable_block_rsv(root->fs_info, rsv);
6541
6542 trans = btrfs_start_transaction(root, 4);
6543 if (IS_ERR(trans)) {
6544 err = PTR_ERR(trans);
6545 goto out;
6546 }
6519 6547
6520 btrfs_set_trans_block_group(trans, inode); 6548 /*
6549 * Reserve space for the truncate process. Truncate should be adding
6550 * space, but if there are snapshots it may end up using space.
6551 */
6552 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6553 BUG_ON(ret);
6521 6554
6522 ret = btrfs_orphan_add(trans, inode); 6555 ret = btrfs_orphan_add(trans, inode);
6523 if (ret) { 6556 if (ret) {
6524 btrfs_end_transaction(trans, root); 6557 btrfs_end_transaction(trans, root);
6525 return ret; 6558 goto out;
6526 } 6559 }
6527 6560
6528 nr = trans->blocks_used; 6561 nr = trans->blocks_used;
6529 btrfs_end_transaction(trans, root); 6562 btrfs_end_transaction(trans, root);
6530 btrfs_btree_balance_dirty(root, nr); 6563 btrfs_btree_balance_dirty(root, nr);
6531 6564
6532 /* Now start a transaction for the truncate */ 6565 /*
6533 trans = btrfs_start_transaction(root, 0); 6566 * Ok so we've already migrated our bytes over for the truncate, so here
6534 if (IS_ERR(trans)) 6567 * just reserve the one slot we need for updating the inode.
6535 return PTR_ERR(trans); 6568 */
6536 btrfs_set_trans_block_group(trans, inode); 6569 trans = btrfs_start_transaction(root, 1);
6537 trans->block_rsv = root->orphan_block_rsv; 6570 if (IS_ERR(trans)) {
6571 err = PTR_ERR(trans);
6572 goto out;
6573 }
6574 trans->block_rsv = rsv;
6538 6575
6539 /* 6576 /*
6540 * setattr is responsible for setting the ordered_data_close flag, 6577 * setattr is responsible for setting the ordered_data_close flag,
@@ -6558,24 +6595,17 @@ static int btrfs_truncate(struct inode *inode)
6558 6595
6559 while (1) { 6596 while (1) {
6560 if (!trans) { 6597 if (!trans) {
6561 trans = btrfs_start_transaction(root, 0); 6598 trans = btrfs_start_transaction(root, 3);
6562 if (IS_ERR(trans)) 6599 if (IS_ERR(trans)) {
6563 return PTR_ERR(trans); 6600 err = PTR_ERR(trans);
6564 btrfs_set_trans_block_group(trans, inode); 6601 goto out;
6565 trans->block_rsv = root->orphan_block_rsv; 6602 }
6566 }
6567 6603
6568 ret = btrfs_block_rsv_check(trans, root, 6604 ret = btrfs_truncate_reserve_metadata(trans, root,
6569 root->orphan_block_rsv, 0, 5); 6605 rsv);
6570 if (ret == -EAGAIN) { 6606 BUG_ON(ret);
6571 ret = btrfs_commit_transaction(trans, root); 6607
6572 if (ret) 6608 trans->block_rsv = rsv;
6573 return ret;
6574 trans = NULL;
6575 continue;
6576 } else if (ret) {
6577 err = ret;
6578 break;
6579 } 6609 }
6580 6610
6581 ret = btrfs_truncate_inode_items(trans, root, inode, 6611 ret = btrfs_truncate_inode_items(trans, root, inode,
@@ -6586,6 +6616,7 @@ static int btrfs_truncate(struct inode *inode)
6586 break; 6616 break;
6587 } 6617 }
6588 6618
6619 trans->block_rsv = &root->fs_info->trans_block_rsv;
6589 ret = btrfs_update_inode(trans, root, inode); 6620 ret = btrfs_update_inode(trans, root, inode);
6590 if (ret) { 6621 if (ret) {
6591 err = ret; 6622 err = ret;
@@ -6599,6 +6630,7 @@ static int btrfs_truncate(struct inode *inode)
6599 } 6630 }
6600 6631
6601 if (ret == 0 && inode->i_nlink > 0) { 6632 if (ret == 0 && inode->i_nlink > 0) {
6633 trans->block_rsv = root->orphan_block_rsv;
6602 ret = btrfs_orphan_del(trans, inode); 6634 ret = btrfs_orphan_del(trans, inode);
6603 if (ret) 6635 if (ret)
6604 err = ret; 6636 err = ret;
@@ -6610,15 +6642,20 @@ static int btrfs_truncate(struct inode *inode)
6610 ret = btrfs_orphan_del(NULL, inode); 6642 ret = btrfs_orphan_del(NULL, inode);
6611 } 6643 }
6612 6644
6645 trans->block_rsv = &root->fs_info->trans_block_rsv;
6613 ret = btrfs_update_inode(trans, root, inode); 6646 ret = btrfs_update_inode(trans, root, inode);
6614 if (ret && !err) 6647 if (ret && !err)
6615 err = ret; 6648 err = ret;
6616 6649
6617 nr = trans->blocks_used; 6650 nr = trans->blocks_used;
6618 ret = btrfs_end_transaction_throttle(trans, root); 6651 ret = btrfs_end_transaction_throttle(trans, root);
6652 btrfs_btree_balance_dirty(root, nr);
6653
6654out:
6655 btrfs_free_block_rsv(root, rsv);
6656
6619 if (ret && !err) 6657 if (ret && !err)
6620 err = ret; 6658 err = ret;
6621 btrfs_btree_balance_dirty(root, nr);
6622 6659
6623 return err; 6660 return err;
6624} 6661}
@@ -6627,15 +6664,14 @@ static int btrfs_truncate(struct inode *inode)
6627 * create a new subvolume directory/inode (helper for the ioctl). 6664 * create a new subvolume directory/inode (helper for the ioctl).
6628 */ 6665 */
6629int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 6666int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6630 struct btrfs_root *new_root, 6667 struct btrfs_root *new_root, u64 new_dirid)
6631 u64 new_dirid, u64 alloc_hint)
6632{ 6668{
6633 struct inode *inode; 6669 struct inode *inode;
6634 int err; 6670 int err;
6635 u64 index = 0; 6671 u64 index = 0;
6636 6672
6637 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 6673 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
6638 new_dirid, alloc_hint, S_IFDIR | 0700, &index); 6674 new_dirid, S_IFDIR | 0700, &index);
6639 if (IS_ERR(inode)) 6675 if (IS_ERR(inode))
6640 return PTR_ERR(inode); 6676 return PTR_ERR(inode);
6641 inode->i_op = &btrfs_dir_inode_operations; 6677 inode->i_op = &btrfs_dir_inode_operations;
@@ -6748,21 +6784,6 @@ void btrfs_destroy_inode(struct inode *inode)
6748 spin_unlock(&root->fs_info->ordered_extent_lock); 6784 spin_unlock(&root->fs_info->ordered_extent_lock);
6749 } 6785 }
6750 6786
6751 if (root == root->fs_info->tree_root) {
6752 struct btrfs_block_group_cache *block_group;
6753
6754 block_group = btrfs_lookup_block_group(root->fs_info,
6755 BTRFS_I(inode)->block_group);
6756 if (block_group && block_group->inode == inode) {
6757 spin_lock(&block_group->lock);
6758 block_group->inode = NULL;
6759 spin_unlock(&block_group->lock);
6760 btrfs_put_block_group(block_group);
6761 } else if (block_group) {
6762 btrfs_put_block_group(block_group);
6763 }
6764 }
6765
6766 spin_lock(&root->orphan_lock); 6787 spin_lock(&root->orphan_lock);
6767 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6788 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
6768 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n", 6789 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
@@ -6948,8 +6969,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6948 goto out_notrans; 6969 goto out_notrans;
6949 } 6970 }
6950 6971
6951 btrfs_set_trans_block_group(trans, new_dir);
6952
6953 if (dest != root) 6972 if (dest != root)
6954 btrfs_record_root_in_trans(trans, dest); 6973 btrfs_record_root_in_trans(trans, dest);
6955 6974
@@ -7131,16 +7150,13 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7131 if (IS_ERR(trans)) 7150 if (IS_ERR(trans))
7132 return PTR_ERR(trans); 7151 return PTR_ERR(trans);
7133 7152
7134 btrfs_set_trans_block_group(trans, dir);
7135
7136 err = btrfs_find_free_ino(root, &objectid); 7153 err = btrfs_find_free_ino(root, &objectid);
7137 if (err) 7154 if (err)
7138 goto out_unlock; 7155 goto out_unlock;
7139 7156
7140 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 7157 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
7141 dentry->d_name.len, btrfs_ino(dir), objectid, 7158 dentry->d_name.len, btrfs_ino(dir), objectid,
7142 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 7159 S_IFLNK|S_IRWXUGO, &index);
7143 &index);
7144 if (IS_ERR(inode)) { 7160 if (IS_ERR(inode)) {
7145 err = PTR_ERR(inode); 7161 err = PTR_ERR(inode);
7146 goto out_unlock; 7162 goto out_unlock;
@@ -7152,7 +7168,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7152 goto out_unlock; 7168 goto out_unlock;
7153 } 7169 }
7154 7170
7155 btrfs_set_trans_block_group(trans, inode);
7156 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 7171 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
7157 if (err) 7172 if (err)
7158 drop_inode = 1; 7173 drop_inode = 1;
@@ -7163,8 +7178,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7163 inode->i_op = &btrfs_file_inode_operations; 7178 inode->i_op = &btrfs_file_inode_operations;
7164 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7179 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
7165 } 7180 }
7166 btrfs_update_inode_block_group(trans, inode);
7167 btrfs_update_inode_block_group(trans, dir);
7168 if (drop_inode) 7181 if (drop_inode)
7169 goto out_unlock; 7182 goto out_unlock;
7170 7183
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 85e818ce00c5..ac37040e426a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -243,7 +243,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
243 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 243 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
244 } 244 }
245 245
246 trans = btrfs_join_transaction(root, 1); 246 trans = btrfs_join_transaction(root);
247 BUG_ON(IS_ERR(trans)); 247 BUG_ON(IS_ERR(trans));
248 248
249 ret = btrfs_update_inode(trans, root, inode); 249 ret = btrfs_update_inode(trans, root, inode);
@@ -414,8 +414,7 @@ static noinline int create_subvol(struct btrfs_root *root,
414 414
415 btrfs_record_root_in_trans(trans, new_root); 415 btrfs_record_root_in_trans(trans, new_root);
416 416
417 ret = btrfs_create_subvol_root(trans, new_root, new_dirid, 417 ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
418 BTRFS_I(dir)->block_group);
419 /* 418 /*
420 * insert the directory item 419 * insert the directory item
421 */ 420 */
@@ -707,16 +706,17 @@ static int find_new_extents(struct btrfs_root *root,
707 struct btrfs_file_extent_item *extent; 706 struct btrfs_file_extent_item *extent;
708 int type; 707 int type;
709 int ret; 708 int ret;
709 u64 ino = btrfs_ino(inode);
710 710
711 path = btrfs_alloc_path(); 711 path = btrfs_alloc_path();
712 if (!path) 712 if (!path)
713 return -ENOMEM; 713 return -ENOMEM;
714 714
715 min_key.objectid = inode->i_ino; 715 min_key.objectid = ino;
716 min_key.type = BTRFS_EXTENT_DATA_KEY; 716 min_key.type = BTRFS_EXTENT_DATA_KEY;
717 min_key.offset = *off; 717 min_key.offset = *off;
718 718
719 max_key.objectid = inode->i_ino; 719 max_key.objectid = ino;
720 max_key.type = (u8)-1; 720 max_key.type = (u8)-1;
721 max_key.offset = (u64)-1; 721 max_key.offset = (u64)-1;
722 722
@@ -727,7 +727,7 @@ static int find_new_extents(struct btrfs_root *root,
727 path, 0, newer_than); 727 path, 0, newer_than);
728 if (ret != 0) 728 if (ret != 0)
729 goto none; 729 goto none;
730 if (min_key.objectid != inode->i_ino) 730 if (min_key.objectid != ino)
731 goto none; 731 goto none;
732 if (min_key.type != BTRFS_EXTENT_DATA_KEY) 732 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
733 goto none; 733 goto none;
@@ -2489,12 +2489,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
2489 if (ret) 2489 if (ret)
2490 goto out; 2490 goto out;
2491 2491
2492 mutex_lock(&root->fs_info->trans_mutex); 2492 atomic_inc(&root->fs_info->open_ioctl_trans);
2493 root->fs_info->open_ioctl_trans++;
2494 mutex_unlock(&root->fs_info->trans_mutex);
2495 2493
2496 ret = -ENOMEM; 2494 ret = -ENOMEM;
2497 trans = btrfs_start_ioctl_transaction(root, 0); 2495 trans = btrfs_start_ioctl_transaction(root);
2498 if (IS_ERR(trans)) 2496 if (IS_ERR(trans))
2499 goto out_drop; 2497 goto out_drop;
2500 2498
@@ -2502,9 +2500,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
2502 return 0; 2500 return 0;
2503 2501
2504out_drop: 2502out_drop:
2505 mutex_lock(&root->fs_info->trans_mutex); 2503 atomic_dec(&root->fs_info->open_ioctl_trans);
2506 root->fs_info->open_ioctl_trans--;
2507 mutex_unlock(&root->fs_info->trans_mutex);
2508 mnt_drop_write(file->f_path.mnt); 2504 mnt_drop_write(file->f_path.mnt);
2509out: 2505out:
2510 return ret; 2506 return ret;
@@ -2738,9 +2734,7 @@ long btrfs_ioctl_trans_end(struct file *file)
2738 2734
2739 btrfs_end_transaction(trans, root); 2735 btrfs_end_transaction(trans, root);
2740 2736
2741 mutex_lock(&root->fs_info->trans_mutex); 2737 atomic_dec(&root->fs_info->open_ioctl_trans);
2742 root->fs_info->open_ioctl_trans--;
2743 mutex_unlock(&root->fs_info->trans_mutex);
2744 2738
2745 mnt_drop_write(file->f_path.mnt); 2739 mnt_drop_write(file->f_path.mnt);
2746 return 0; 2740 return 0;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ca38eca70af0..b1ef27cc673b 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -677,6 +677,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
677 err = -ENOMEM; 677 err = -ENOMEM;
678 goto out; 678 goto out;
679 } 679 }
680 path1->reada = 1;
681 path2->reada = 2;
680 682
681 node = alloc_backref_node(cache); 683 node = alloc_backref_node(cache);
682 if (!node) { 684 if (!node) {
@@ -1999,6 +2001,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1999 path = btrfs_alloc_path(); 2001 path = btrfs_alloc_path();
2000 if (!path) 2002 if (!path)
2001 return -ENOMEM; 2003 return -ENOMEM;
2004 path->reada = 1;
2002 2005
2003 reloc_root = root->reloc_root; 2006 reloc_root = root->reloc_root;
2004 root_item = &reloc_root->root_item; 2007 root_item = &reloc_root->root_item;
@@ -2139,10 +2142,10 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2139 u64 num_bytes = 0; 2142 u64 num_bytes = 0;
2140 int ret; 2143 int ret;
2141 2144
2142 mutex_lock(&root->fs_info->trans_mutex); 2145 spin_lock(&root->fs_info->trans_lock);
2143 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2146 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2144 rc->merging_rsv_size += rc->nodes_relocated * 2; 2147 rc->merging_rsv_size += rc->nodes_relocated * 2;
2145 mutex_unlock(&root->fs_info->trans_mutex); 2148 spin_unlock(&root->fs_info->trans_lock);
2146again: 2149again:
2147 if (!err) { 2150 if (!err) {
2148 num_bytes = rc->merging_rsv_size; 2151 num_bytes = rc->merging_rsv_size;
@@ -2152,7 +2155,7 @@ again:
2152 err = ret; 2155 err = ret;
2153 } 2156 }
2154 2157
2155 trans = btrfs_join_transaction(rc->extent_root, 1); 2158 trans = btrfs_join_transaction(rc->extent_root);
2156 if (IS_ERR(trans)) { 2159 if (IS_ERR(trans)) {
2157 if (!err) 2160 if (!err)
2158 btrfs_block_rsv_release(rc->extent_root, 2161 btrfs_block_rsv_release(rc->extent_root,
@@ -2211,9 +2214,9 @@ int merge_reloc_roots(struct reloc_control *rc)
2211 int ret; 2214 int ret;
2212again: 2215again:
2213 root = rc->extent_root; 2216 root = rc->extent_root;
2214 mutex_lock(&root->fs_info->trans_mutex); 2217 spin_lock(&root->fs_info->trans_lock);
2215 list_splice_init(&rc->reloc_roots, &reloc_roots); 2218 list_splice_init(&rc->reloc_roots, &reloc_roots);
2216 mutex_unlock(&root->fs_info->trans_mutex); 2219 spin_unlock(&root->fs_info->trans_lock);
2217 2220
2218 while (!list_empty(&reloc_roots)) { 2221 while (!list_empty(&reloc_roots)) {
2219 found = 1; 2222 found = 1;
@@ -3236,7 +3239,7 @@ truncate:
3236 goto out; 3239 goto out;
3237 } 3240 }
3238 3241
3239 trans = btrfs_join_transaction(root, 0); 3242 trans = btrfs_join_transaction(root);
3240 if (IS_ERR(trans)) { 3243 if (IS_ERR(trans)) {
3241 btrfs_free_path(path); 3244 btrfs_free_path(path);
3242 ret = PTR_ERR(trans); 3245 ret = PTR_ERR(trans);
@@ -3300,6 +3303,7 @@ static int find_data_references(struct reloc_control *rc,
3300 path = btrfs_alloc_path(); 3303 path = btrfs_alloc_path();
3301 if (!path) 3304 if (!path)
3302 return -ENOMEM; 3305 return -ENOMEM;
3306 path->reada = 1;
3303 3307
3304 root = read_fs_root(rc->extent_root->fs_info, ref_root); 3308 root = read_fs_root(rc->extent_root->fs_info, ref_root);
3305 if (IS_ERR(root)) { 3309 if (IS_ERR(root)) {
@@ -3586,17 +3590,17 @@ next:
3586static void set_reloc_control(struct reloc_control *rc) 3590static void set_reloc_control(struct reloc_control *rc)
3587{ 3591{
3588 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3592 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3589 mutex_lock(&fs_info->trans_mutex); 3593 spin_lock(&fs_info->trans_lock);
3590 fs_info->reloc_ctl = rc; 3594 fs_info->reloc_ctl = rc;
3591 mutex_unlock(&fs_info->trans_mutex); 3595 spin_unlock(&fs_info->trans_lock);
3592} 3596}
3593 3597
3594static void unset_reloc_control(struct reloc_control *rc) 3598static void unset_reloc_control(struct reloc_control *rc)
3595{ 3599{
3596 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3600 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3597 mutex_lock(&fs_info->trans_mutex); 3601 spin_lock(&fs_info->trans_lock);
3598 fs_info->reloc_ctl = NULL; 3602 fs_info->reloc_ctl = NULL;
3599 mutex_unlock(&fs_info->trans_mutex); 3603 spin_unlock(&fs_info->trans_lock);
3600} 3604}
3601 3605
3602static int check_extent_flags(u64 flags) 3606static int check_extent_flags(u64 flags)
@@ -3645,7 +3649,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 rc->create_reloc_tree = 1; 3649 rc->create_reloc_tree = 1;
3646 set_reloc_control(rc); 3650 set_reloc_control(rc);
3647 3651
3648 trans = btrfs_join_transaction(rc->extent_root, 1); 3652 trans = btrfs_join_transaction(rc->extent_root);
3649 BUG_ON(IS_ERR(trans)); 3653 BUG_ON(IS_ERR(trans));
3650 btrfs_commit_transaction(trans, rc->extent_root); 3654 btrfs_commit_transaction(trans, rc->extent_root);
3651 return 0; 3655 return 0;
@@ -3668,6 +3672,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3668 path = btrfs_alloc_path(); 3672 path = btrfs_alloc_path();
3669 if (!path) 3673 if (!path)
3670 return -ENOMEM; 3674 return -ENOMEM;
3675 path->reada = 1;
3671 3676
3672 ret = prepare_to_relocate(rc); 3677 ret = prepare_to_relocate(rc);
3673 if (ret) { 3678 if (ret) {
@@ -3834,7 +3839,7 @@ restart:
3834 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); 3839 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3835 3840
3836 /* get rid of pinned extents */ 3841 /* get rid of pinned extents */
3837 trans = btrfs_join_transaction(rc->extent_root, 1); 3842 trans = btrfs_join_transaction(rc->extent_root);
3838 if (IS_ERR(trans)) 3843 if (IS_ERR(trans))
3839 err = PTR_ERR(trans); 3844 err = PTR_ERR(trans);
3840 else 3845 else
@@ -4093,6 +4098,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4093 path = btrfs_alloc_path(); 4098 path = btrfs_alloc_path();
4094 if (!path) 4099 if (!path)
4095 return -ENOMEM; 4100 return -ENOMEM;
4101 path->reada = -1;
4096 4102
4097 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 4103 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4098 key.type = BTRFS_ROOT_ITEM_KEY; 4104 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4159,7 +4165,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4159 4165
4160 set_reloc_control(rc); 4166 set_reloc_control(rc);
4161 4167
4162 trans = btrfs_join_transaction(rc->extent_root, 1); 4168 trans = btrfs_join_transaction(rc->extent_root);
4163 if (IS_ERR(trans)) { 4169 if (IS_ERR(trans)) {
4164 unset_reloc_control(rc); 4170 unset_reloc_control(rc);
4165 err = PTR_ERR(trans); 4171 err = PTR_ERR(trans);
@@ -4193,7 +4199,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4193 4199
4194 unset_reloc_control(rc); 4200 unset_reloc_control(rc);
4195 4201
4196 trans = btrfs_join_transaction(rc->extent_root, 1); 4202 trans = btrfs_join_transaction(rc->extent_root);
4197 if (IS_ERR(trans)) 4203 if (IS_ERR(trans))
4198 err = PTR_ERR(trans); 4204 err = PTR_ERR(trans);
4199 else 4205 else
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 6dfed0c27ac3..df50fd1eca8f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -117,33 +117,37 @@ static void scrub_free_csums(struct scrub_dev *sdev)
117 } 117 }
118} 118}
119 119
120static void scrub_free_bio(struct bio *bio)
121{
122 int i;
123 struct page *last_page = NULL;
124
125 if (!bio)
126 return;
127
128 for (i = 0; i < bio->bi_vcnt; ++i) {
129 if (bio->bi_io_vec[i].bv_page == last_page)
130 continue;
131 last_page = bio->bi_io_vec[i].bv_page;
132 __free_page(last_page);
133 }
134 bio_put(bio);
135}
136
120static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev) 137static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
121{ 138{
122 int i; 139 int i;
123 int j;
124 struct page *last_page;
125 140
126 if (!sdev) 141 if (!sdev)
127 return; 142 return;
128 143
129 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 144 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
130 struct scrub_bio *sbio = sdev->bios[i]; 145 struct scrub_bio *sbio = sdev->bios[i];
131 struct bio *bio;
132 146
133 if (!sbio) 147 if (!sbio)
134 break; 148 break;
135 149
136 bio = sbio->bio; 150 scrub_free_bio(sbio->bio);
137 if (bio) {
138 last_page = NULL;
139 for (j = 0; j < bio->bi_vcnt; ++j) {
140 if (bio->bi_io_vec[j].bv_page == last_page)
141 continue;
142 last_page = bio->bi_io_vec[j].bv_page;
143 __free_page(last_page);
144 }
145 bio_put(bio);
146 }
147 kfree(sbio); 151 kfree(sbio);
148 } 152 }
149 153
@@ -156,8 +160,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
156{ 160{
157 struct scrub_dev *sdev; 161 struct scrub_dev *sdev;
158 int i; 162 int i;
159 int j;
160 int ret;
161 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; 163 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
162 164
163 sdev = kzalloc(sizeof(*sdev), GFP_NOFS); 165 sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
@@ -165,7 +167,6 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
165 goto nomem; 167 goto nomem;
166 sdev->dev = dev; 168 sdev->dev = dev;
167 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) { 169 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
168 struct bio *bio;
169 struct scrub_bio *sbio; 170 struct scrub_bio *sbio;
170 171
171 sbio = kzalloc(sizeof(*sbio), GFP_NOFS); 172 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
@@ -173,32 +174,10 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
173 goto nomem; 174 goto nomem;
174 sdev->bios[i] = sbio; 175 sdev->bios[i] = sbio;
175 176
176 bio = bio_kmalloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
177 if (!bio)
178 goto nomem;
179
180 sbio->index = i; 177 sbio->index = i;
181 sbio->sdev = sdev; 178 sbio->sdev = sdev;
182 sbio->bio = bio;
183 sbio->count = 0; 179 sbio->count = 0;
184 sbio->work.func = scrub_checksum; 180 sbio->work.func = scrub_checksum;
185 bio->bi_private = sdev->bios[i];
186 bio->bi_end_io = scrub_bio_end_io;
187 bio->bi_sector = 0;
188 bio->bi_bdev = dev->bdev;
189 bio->bi_size = 0;
190
191 for (j = 0; j < SCRUB_PAGES_PER_BIO; ++j) {
192 struct page *page;
193 page = alloc_page(GFP_NOFS);
194 if (!page)
195 goto nomem;
196
197 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
198 if (!ret)
199 goto nomem;
200 }
201 WARN_ON(bio->bi_vcnt != SCRUB_PAGES_PER_BIO);
202 181
203 if (i != SCRUB_BIOS_PER_DEV-1) 182 if (i != SCRUB_BIOS_PER_DEV-1)
204 sdev->bios[i]->next_free = i + 1; 183 sdev->bios[i]->next_free = i + 1;
@@ -369,9 +348,6 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
369 int ret; 348 int ret;
370 DECLARE_COMPLETION_ONSTACK(complete); 349 DECLARE_COMPLETION_ONSTACK(complete);
371 350
372 /* we are going to wait on this IO */
373 rw |= REQ_SYNC;
374
375 bio = bio_alloc(GFP_NOFS, 1); 351 bio = bio_alloc(GFP_NOFS, 1);
376 bio->bi_bdev = bdev; 352 bio->bi_bdev = bdev;
377 bio->bi_sector = sector; 353 bio->bi_sector = sector;
@@ -380,6 +356,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
380 bio->bi_private = &complete; 356 bio->bi_private = &complete;
381 submit_bio(rw, bio); 357 submit_bio(rw, bio);
382 358
359 /* this will also unplug the queue */
383 wait_for_completion(&complete); 360 wait_for_completion(&complete);
384 361
385 ret = !test_bit(BIO_UPTODATE, &bio->bi_flags); 362 ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -394,6 +371,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
394 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 371 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
395 372
396 sbio->err = err; 373 sbio->err = err;
374 sbio->bio = bio;
397 375
398 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work); 376 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
399} 377}
@@ -453,6 +431,8 @@ static void scrub_checksum(struct btrfs_work *work)
453 } 431 }
454 432
455out: 433out:
434 scrub_free_bio(sbio->bio);
435 sbio->bio = NULL;
456 spin_lock(&sdev->list_lock); 436 spin_lock(&sdev->list_lock);
457 sbio->next_free = sdev->first_free; 437 sbio->next_free = sdev->first_free;
458 sdev->first_free = sbio->index; 438 sdev->first_free = sbio->index;
@@ -583,25 +563,50 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
583static int scrub_submit(struct scrub_dev *sdev) 563static int scrub_submit(struct scrub_dev *sdev)
584{ 564{
585 struct scrub_bio *sbio; 565 struct scrub_bio *sbio;
566 struct bio *bio;
567 int i;
586 568
587 if (sdev->curr == -1) 569 if (sdev->curr == -1)
588 return 0; 570 return 0;
589 571
590 sbio = sdev->bios[sdev->curr]; 572 sbio = sdev->bios[sdev->curr];
591 573
592 sbio->bio->bi_sector = sbio->physical >> 9; 574 bio = bio_alloc(GFP_NOFS, sbio->count);
593 sbio->bio->bi_size = sbio->count * PAGE_SIZE; 575 if (!bio)
594 sbio->bio->bi_next = NULL; 576 goto nomem;
595 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 577
596 sbio->bio->bi_comp_cpu = -1; 578 bio->bi_private = sbio;
597 sbio->bio->bi_bdev = sdev->dev->bdev; 579 bio->bi_end_io = scrub_bio_end_io;
580 bio->bi_bdev = sdev->dev->bdev;
581 bio->bi_sector = sbio->physical >> 9;
582
583 for (i = 0; i < sbio->count; ++i) {
584 struct page *page;
585 int ret;
586
587 page = alloc_page(GFP_NOFS);
588 if (!page)
589 goto nomem;
590
591 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
592 if (!ret) {
593 __free_page(page);
594 goto nomem;
595 }
596 }
597
598 sbio->err = 0; 598 sbio->err = 0;
599 sdev->curr = -1; 599 sdev->curr = -1;
600 atomic_inc(&sdev->in_flight); 600 atomic_inc(&sdev->in_flight);
601 601
602 submit_bio(0, sbio->bio); 602 submit_bio(READ, bio);
603 603
604 return 0; 604 return 0;
605
606nomem:
607 scrub_free_bio(bio);
608
609 return -ENOMEM;
605} 610}
606 611
607static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 612static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
@@ -633,7 +638,11 @@ again:
633 sbio->logical = logical; 638 sbio->logical = logical;
634 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 639 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
635 sbio->logical + sbio->count * PAGE_SIZE != logical) { 640 sbio->logical + sbio->count * PAGE_SIZE != logical) {
636 scrub_submit(sdev); 641 int ret;
642
643 ret = scrub_submit(sdev);
644 if (ret)
645 return ret;
637 goto again; 646 goto again;
638 } 647 }
639 sbio->spag[sbio->count].flags = flags; 648 sbio->spag[sbio->count].flags = flags;
@@ -645,8 +654,13 @@ again:
645 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 654 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
646 } 655 }
647 ++sbio->count; 656 ++sbio->count;
648 if (sbio->count == SCRUB_PAGES_PER_BIO || force) 657 if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
649 scrub_submit(sdev); 658 int ret;
659
660 ret = scrub_submit(sdev);
661 if (ret)
662 return ret;
663 }
650 664
651 return 0; 665 return 0;
652} 666}
@@ -727,6 +741,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
727 struct btrfs_root *root = fs_info->extent_root; 741 struct btrfs_root *root = fs_info->extent_root;
728 struct btrfs_root *csum_root = fs_info->csum_root; 742 struct btrfs_root *csum_root = fs_info->csum_root;
729 struct btrfs_extent_item *extent; 743 struct btrfs_extent_item *extent;
744 struct blk_plug plug;
730 u64 flags; 745 u64 flags;
731 int ret; 746 int ret;
732 int slot; 747 int slot;
@@ -831,6 +846,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
831 * the scrub. This might currently (crc32) end up to be about 1MB 846 * the scrub. This might currently (crc32) end up to be about 1MB
832 */ 847 */
833 start_stripe = 0; 848 start_stripe = 0;
849 blk_start_plug(&plug);
834again: 850again:
835 logical = base + offset + start_stripe * increment; 851 logical = base + offset + start_stripe * increment;
836 for (i = start_stripe; i < nstripes; ++i) { 852 for (i = start_stripe; i < nstripes; ++i) {
@@ -972,6 +988,7 @@ next:
972 scrub_submit(sdev); 988 scrub_submit(sdev);
973 989
974out: 990out:
991 blk_finish_plug(&plug);
975 btrfs_free_path(path); 992 btrfs_free_path(path);
976 return ret < 0 ? ret : 0; 993 return ret < 0 ? ret : 0;
977} 994}
@@ -1166,7 +1183,7 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1166 int ret; 1183 int ret;
1167 struct btrfs_device *dev; 1184 struct btrfs_device *dev;
1168 1185
1169 if (root->fs_info->closing) 1186 if (btrfs_fs_closing(root->fs_info))
1170 return -EINVAL; 1187 return -EINVAL;
1171 1188
1172 /* 1189 /*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9b2e7e5bc3ef..117e74e3604b 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -161,7 +161,8 @@ enum {
161 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 161 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_err, 164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err,
165}; 166};
166 167
167static match_table_t tokens = { 168static match_table_t tokens = {
@@ -193,6 +194,7 @@ static match_table_t tokens = {
193 {Opt_enospc_debug, "enospc_debug"}, 194 {Opt_enospc_debug, "enospc_debug"},
194 {Opt_subvolrootid, "subvolrootid=%d"}, 195 {Opt_subvolrootid, "subvolrootid=%d"},
195 {Opt_defrag, "autodefrag"}, 196 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"},
196 {Opt_err, NULL}, 198 {Opt_err, NULL},
197}; 199};
198 200
@@ -361,6 +363,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
361 printk(KERN_INFO "btrfs: enabling disk space caching\n"); 363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
362 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 364 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
363 break; 365 break;
366 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
369 break;
364 case Opt_clear_cache: 370 case Opt_clear_cache:
365 printk(KERN_INFO "btrfs: force clearing of disk cache\n"); 371 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
366 btrfs_set_opt(info->mount_opt, CLEAR_CACHE); 372 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index dc80f7156923..dd719662340e 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,6 +35,7 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
35{ 35{
36 WARN_ON(atomic_read(&transaction->use_count) == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
37 if (atomic_dec_and_test(&transaction->use_count)) { 37 if (atomic_dec_and_test(&transaction->use_count)) {
38 BUG_ON(!list_empty(&transaction->list));
38 memset(transaction, 0, sizeof(*transaction)); 39 memset(transaction, 0, sizeof(*transaction));
39 kmem_cache_free(btrfs_transaction_cachep, transaction); 40 kmem_cache_free(btrfs_transaction_cachep, transaction);
40 } 41 }
@@ -49,46 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root)
49/* 50/*
50 * either allocate a new transaction or hop into the existing one 51 * either allocate a new transaction or hop into the existing one
51 */ 52 */
52static noinline int join_transaction(struct btrfs_root *root) 53static noinline int join_transaction(struct btrfs_root *root, int nofail)
53{ 54{
54 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56
57 spin_lock(&root->fs_info->trans_lock);
58 if (root->fs_info->trans_no_join) {
59 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock);
61 return -EBUSY;
62 }
63 }
64
55 cur_trans = root->fs_info->running_transaction; 65 cur_trans = root->fs_info->running_transaction;
56 if (!cur_trans) { 66 if (cur_trans) {
57 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, 67 atomic_inc(&cur_trans->use_count);
58 GFP_NOFS);
59 if (!cur_trans)
60 return -ENOMEM;
61 root->fs_info->generation++;
62 atomic_set(&cur_trans->num_writers, 1);
63 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0;
69 atomic_set(&cur_trans->use_count, 1);
70 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds();
72
73 cur_trans->delayed_refs.root = RB_ROOT;
74 cur_trans->delayed_refs.num_entries = 0;
75 cur_trans->delayed_refs.num_heads_ready = 0;
76 cur_trans->delayed_refs.num_heads = 0;
77 cur_trans->delayed_refs.flushing = 0;
78 cur_trans->delayed_refs.run_delayed_start = 0;
79 spin_lock_init(&cur_trans->delayed_refs.lock);
80
81 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
82 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
83 extent_io_tree_init(&cur_trans->dirty_pages,
84 root->fs_info->btree_inode->i_mapping);
85 spin_lock(&root->fs_info->new_trans_lock);
86 root->fs_info->running_transaction = cur_trans;
87 spin_unlock(&root->fs_info->new_trans_lock);
88 } else {
89 atomic_inc(&cur_trans->num_writers); 68 atomic_inc(&cur_trans->num_writers);
90 cur_trans->num_joined++; 69 cur_trans->num_joined++;
70 spin_unlock(&root->fs_info->trans_lock);
71 return 0;
91 } 72 }
73 spin_unlock(&root->fs_info->trans_lock);
74
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans)
77 return -ENOMEM;
78 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) {
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count);
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 }
88 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait);
91 init_waitqueue_head(&cur_trans->commit_wait);
92 cur_trans->in_commit = 0;
93 cur_trans->blocked = 0;
94 /*
95 * One for this trans handle, one so it will live on until we
96 * commit the transaction.
97 */
98 atomic_set(&cur_trans->use_count, 2);
99 cur_trans->commit_done = 0;
100 cur_trans->start_time = get_seconds();
101
102 cur_trans->delayed_refs.root = RB_ROOT;
103 cur_trans->delayed_refs.num_entries = 0;
104 cur_trans->delayed_refs.num_heads_ready = 0;
105 cur_trans->delayed_refs.num_heads = 0;
106 cur_trans->delayed_refs.flushing = 0;
107 cur_trans->delayed_refs.run_delayed_start = 0;
108 spin_lock_init(&cur_trans->commit_lock);
109 spin_lock_init(&cur_trans->delayed_refs.lock);
110
111 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
112 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
113 extent_io_tree_init(&cur_trans->dirty_pages,
114 root->fs_info->btree_inode->i_mapping);
115 root->fs_info->generation++;
116 cur_trans->transid = root->fs_info->generation;
117 root->fs_info->running_transaction = cur_trans;
118 spin_unlock(&root->fs_info->trans_lock);
92 119
93 return 0; 120 return 0;
94} 121}
@@ -99,39 +126,28 @@ static noinline int join_transaction(struct btrfs_root *root)
99 * to make sure the old root from before we joined the transaction is deleted 126 * to make sure the old root from before we joined the transaction is deleted
100 * when the transaction commits 127 * when the transaction commits
101 */ 128 */
102static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, 129int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root) 130 struct btrfs_root *root)
104{ 131{
105 if (root->ref_cows && root->last_trans < trans->transid) { 132 if (root->ref_cows && root->last_trans < trans->transid) {
106 WARN_ON(root == root->fs_info->extent_root); 133 WARN_ON(root == root->fs_info->extent_root);
107 WARN_ON(root->commit_root != root->node); 134 WARN_ON(root->commit_root != root->node);
108 135
136 spin_lock(&root->fs_info->fs_roots_radix_lock);
137 if (root->last_trans == trans->transid) {
138 spin_unlock(&root->fs_info->fs_roots_radix_lock);
139 return 0;
140 }
141 root->last_trans = trans->transid;
109 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 142 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
110 (unsigned long)root->root_key.objectid, 143 (unsigned long)root->root_key.objectid,
111 BTRFS_ROOT_TRANS_TAG); 144 BTRFS_ROOT_TRANS_TAG);
112 root->last_trans = trans->transid; 145 spin_unlock(&root->fs_info->fs_roots_radix_lock);
113 btrfs_init_reloc_root(trans, root); 146 btrfs_init_reloc_root(trans, root);
114 } 147 }
115 return 0; 148 return 0;
116} 149}
117 150
118int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
119 struct btrfs_root *root)
120{
121 if (!root->ref_cows)
122 return 0;
123
124 mutex_lock(&root->fs_info->trans_mutex);
125 if (root->last_trans == trans->transid) {
126 mutex_unlock(&root->fs_info->trans_mutex);
127 return 0;
128 }
129
130 record_root_in_trans(trans, root);
131 mutex_unlock(&root->fs_info->trans_mutex);
132 return 0;
133}
134
135/* wait for commit against the current transaction to become unblocked 151/* wait for commit against the current transaction to become unblocked
136 * when this is done, it is safe to start a new transaction, but the current 152 * when this is done, it is safe to start a new transaction, but the current
137 * transaction might not be fully on disk. 153 * transaction might not be fully on disk.
@@ -140,21 +156,23 @@ static void wait_current_trans(struct btrfs_root *root)
140{ 156{
141 struct btrfs_transaction *cur_trans; 157 struct btrfs_transaction *cur_trans;
142 158
159 spin_lock(&root->fs_info->trans_lock);
143 cur_trans = root->fs_info->running_transaction; 160 cur_trans = root->fs_info->running_transaction;
144 if (cur_trans && cur_trans->blocked) { 161 if (cur_trans && cur_trans->blocked) {
145 DEFINE_WAIT(wait); 162 DEFINE_WAIT(wait);
146 atomic_inc(&cur_trans->use_count); 163 atomic_inc(&cur_trans->use_count);
164 spin_unlock(&root->fs_info->trans_lock);
147 while (1) { 165 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 166 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 167 TASK_UNINTERRUPTIBLE);
150 if (!cur_trans->blocked) 168 if (!cur_trans->blocked)
151 break; 169 break;
152 mutex_unlock(&root->fs_info->trans_mutex);
153 schedule(); 170 schedule();
154 mutex_lock(&root->fs_info->trans_mutex);
155 } 171 }
156 finish_wait(&root->fs_info->transaction_wait, &wait); 172 finish_wait(&root->fs_info->transaction_wait, &wait);
157 put_transaction(cur_trans); 173 put_transaction(cur_trans);
174 } else {
175 spin_unlock(&root->fs_info->trans_lock);
158 } 176 }
159} 177}
160 178
@@ -167,10 +185,16 @@ enum btrfs_trans_type {
167 185
168static int may_wait_transaction(struct btrfs_root *root, int type) 186static int may_wait_transaction(struct btrfs_root *root, int type)
169{ 187{
170 if (!root->fs_info->log_root_recovering && 188 if (root->fs_info->log_root_recovering)
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || 189 return 0;
172 type == TRANS_USERSPACE)) 190
191 if (type == TRANS_USERSPACE)
192 return 1;
193
194 if (type == TRANS_START &&
195 !atomic_read(&root->fs_info->open_ioctl_trans))
173 return 1; 196 return 1;
197
174 return 0; 198 return 0;
175} 199}
176 200
@@ -184,36 +208,44 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
184 208
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 209 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS); 210 return ERR_PTR(-EROFS);
211
212 if (current->journal_info) {
213 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
214 h = current->journal_info;
215 h->use_count++;
216 h->orig_rsv = h->block_rsv;
217 h->block_rsv = NULL;
218 goto got_it;
219 }
187again: 220again:
188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 221 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
189 if (!h) 222 if (!h)
190 return ERR_PTR(-ENOMEM); 223 return ERR_PTR(-ENOMEM);
191 224
192 if (type != TRANS_JOIN_NOLOCK)
193 mutex_lock(&root->fs_info->trans_mutex);
194 if (may_wait_transaction(root, type)) 225 if (may_wait_transaction(root, type))
195 wait_current_trans(root); 226 wait_current_trans(root);
196 227
197 ret = join_transaction(root); 228 do {
229 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
230 if (ret == -EBUSY)
231 wait_current_trans(root);
232 } while (ret == -EBUSY);
233
198 if (ret < 0) { 234 if (ret < 0) {
199 kmem_cache_free(btrfs_trans_handle_cachep, h); 235 kmem_cache_free(btrfs_trans_handle_cachep, h);
200 if (type != TRANS_JOIN_NOLOCK)
201 mutex_unlock(&root->fs_info->trans_mutex);
202 return ERR_PTR(ret); 236 return ERR_PTR(ret);
203 } 237 }
204 238
205 cur_trans = root->fs_info->running_transaction; 239 cur_trans = root->fs_info->running_transaction;
206 atomic_inc(&cur_trans->use_count);
207 if (type != TRANS_JOIN_NOLOCK)
208 mutex_unlock(&root->fs_info->trans_mutex);
209 240
210 h->transid = cur_trans->transid; 241 h->transid = cur_trans->transid;
211 h->transaction = cur_trans; 242 h->transaction = cur_trans;
212 h->blocks_used = 0; 243 h->blocks_used = 0;
213 h->block_group = 0;
214 h->bytes_reserved = 0; 244 h->bytes_reserved = 0;
215 h->delayed_ref_updates = 0; 245 h->delayed_ref_updates = 0;
246 h->use_count = 1;
216 h->block_rsv = NULL; 247 h->block_rsv = NULL;
248 h->orig_rsv = NULL;
217 249
218 smp_mb(); 250 smp_mb();
219 if (cur_trans->blocked && may_wait_transaction(root, type)) { 251 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -241,11 +273,8 @@ again:
241 } 273 }
242 } 274 }
243 275
244 if (type != TRANS_JOIN_NOLOCK) 276got_it:
245 mutex_lock(&root->fs_info->trans_mutex); 277 btrfs_record_root_in_trans(h, root);
246 record_root_in_trans(h, root);
247 if (type != TRANS_JOIN_NOLOCK)
248 mutex_unlock(&root->fs_info->trans_mutex);
249 278
250 if (!current->journal_info && type != TRANS_USERSPACE) 279 if (!current->journal_info && type != TRANS_USERSPACE)
251 current->journal_info = h; 280 current->journal_info = h;
@@ -257,22 +286,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
257{ 286{
258 return start_transaction(root, num_items, TRANS_START); 287 return start_transaction(root, num_items, TRANS_START);
259} 288}
260struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 289struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
261 int num_blocks)
262{ 290{
263 return start_transaction(root, 0, TRANS_JOIN); 291 return start_transaction(root, 0, TRANS_JOIN);
264} 292}
265 293
266struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, 294struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
267 int num_blocks)
268{ 295{
269 return start_transaction(root, 0, TRANS_JOIN_NOLOCK); 296 return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
270} 297}
271 298
272struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 299struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
273 int num_blocks)
274{ 300{
275 return start_transaction(r, 0, TRANS_USERSPACE); 301 return start_transaction(root, 0, TRANS_USERSPACE);
276} 302}
277 303
278/* wait for a transaction commit to be fully complete */ 304/* wait for a transaction commit to be fully complete */
@@ -280,17 +306,13 @@ static noinline int wait_for_commit(struct btrfs_root *root,
280 struct btrfs_transaction *commit) 306 struct btrfs_transaction *commit)
281{ 307{
282 DEFINE_WAIT(wait); 308 DEFINE_WAIT(wait);
283 mutex_lock(&root->fs_info->trans_mutex);
284 while (!commit->commit_done) { 309 while (!commit->commit_done) {
285 prepare_to_wait(&commit->commit_wait, &wait, 310 prepare_to_wait(&commit->commit_wait, &wait,
286 TASK_UNINTERRUPTIBLE); 311 TASK_UNINTERRUPTIBLE);
287 if (commit->commit_done) 312 if (commit->commit_done)
288 break; 313 break;
289 mutex_unlock(&root->fs_info->trans_mutex);
290 schedule(); 314 schedule();
291 mutex_lock(&root->fs_info->trans_mutex);
292 } 315 }
293 mutex_unlock(&root->fs_info->trans_mutex);
294 finish_wait(&commit->commit_wait, &wait); 316 finish_wait(&commit->commit_wait, &wait);
295 return 0; 317 return 0;
296} 318}
@@ -300,59 +322,56 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
300 struct btrfs_transaction *cur_trans = NULL, *t; 322 struct btrfs_transaction *cur_trans = NULL, *t;
301 int ret; 323 int ret;
302 324
303 mutex_lock(&root->fs_info->trans_mutex);
304
305 ret = 0; 325 ret = 0;
306 if (transid) { 326 if (transid) {
307 if (transid <= root->fs_info->last_trans_committed) 327 if (transid <= root->fs_info->last_trans_committed)
308 goto out_unlock; 328 goto out;
309 329
310 /* find specified transaction */ 330 /* find specified transaction */
331 spin_lock(&root->fs_info->trans_lock);
311 list_for_each_entry(t, &root->fs_info->trans_list, list) { 332 list_for_each_entry(t, &root->fs_info->trans_list, list) {
312 if (t->transid == transid) { 333 if (t->transid == transid) {
313 cur_trans = t; 334 cur_trans = t;
335 atomic_inc(&cur_trans->use_count);
314 break; 336 break;
315 } 337 }
316 if (t->transid > transid) 338 if (t->transid > transid)
317 break; 339 break;
318 } 340 }
341 spin_unlock(&root->fs_info->trans_lock);
319 ret = -EINVAL; 342 ret = -EINVAL;
320 if (!cur_trans) 343 if (!cur_trans)
321 goto out_unlock; /* bad transid */ 344 goto out; /* bad transid */
322 } else { 345 } else {
323 /* find newest transaction that is committing | committed */ 346 /* find newest transaction that is committing | committed */
347 spin_lock(&root->fs_info->trans_lock);
324 list_for_each_entry_reverse(t, &root->fs_info->trans_list, 348 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
325 list) { 349 list) {
326 if (t->in_commit) { 350 if (t->in_commit) {
327 if (t->commit_done) 351 if (t->commit_done)
328 goto out_unlock; 352 goto out;
329 cur_trans = t; 353 cur_trans = t;
354 atomic_inc(&cur_trans->use_count);
330 break; 355 break;
331 } 356 }
332 } 357 }
358 spin_unlock(&root->fs_info->trans_lock);
333 if (!cur_trans) 359 if (!cur_trans)
334 goto out_unlock; /* nothing committing|committed */ 360 goto out; /* nothing committing|committed */
335 } 361 }
336 362
337 atomic_inc(&cur_trans->use_count);
338 mutex_unlock(&root->fs_info->trans_mutex);
339
340 wait_for_commit(root, cur_trans); 363 wait_for_commit(root, cur_trans);
341 364
342 mutex_lock(&root->fs_info->trans_mutex);
343 put_transaction(cur_trans); 365 put_transaction(cur_trans);
344 ret = 0; 366 ret = 0;
345out_unlock: 367out:
346 mutex_unlock(&root->fs_info->trans_mutex);
347 return ret; 368 return ret;
348} 369}
349 370
350void btrfs_throttle(struct btrfs_root *root) 371void btrfs_throttle(struct btrfs_root *root)
351{ 372{
352 mutex_lock(&root->fs_info->trans_mutex); 373 if (!atomic_read(&root->fs_info->open_ioctl_trans))
353 if (!root->fs_info->open_ioctl_trans)
354 wait_current_trans(root); 374 wait_current_trans(root);
355 mutex_unlock(&root->fs_info->trans_mutex);
356} 375}
357 376
358static int should_end_transaction(struct btrfs_trans_handle *trans, 377static int should_end_transaction(struct btrfs_trans_handle *trans,
@@ -370,6 +389,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
370 struct btrfs_transaction *cur_trans = trans->transaction; 389 struct btrfs_transaction *cur_trans = trans->transaction;
371 int updates; 390 int updates;
372 391
392 smp_mb();
373 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 393 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
374 return 1; 394 return 1;
375 395
@@ -388,6 +408,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
388 struct btrfs_fs_info *info = root->fs_info; 408 struct btrfs_fs_info *info = root->fs_info;
389 int count = 0; 409 int count = 0;
390 410
411 if (--trans->use_count) {
412 trans->block_rsv = trans->orig_rsv;
413 return 0;
414 }
415
391 while (count < 4) { 416 while (count < 4) {
392 unsigned long cur = trans->delayed_ref_updates; 417 unsigned long cur = trans->delayed_ref_updates;
393 trans->delayed_ref_updates = 0; 418 trans->delayed_ref_updates = 0;
@@ -410,9 +435,11 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
410 435
411 btrfs_trans_release_metadata(trans, root); 436 btrfs_trans_release_metadata(trans, root);
412 437
413 if (lock && !root->fs_info->open_ioctl_trans && 438 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
414 should_end_transaction(trans, root)) 439 should_end_transaction(trans, root)) {
415 trans->transaction->blocked = 1; 440 trans->transaction->blocked = 1;
441 smp_wmb();
442 }
416 443
417 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 444 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
418 if (throttle) 445 if (throttle)
@@ -703,9 +730,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
703 */ 730 */
704int btrfs_add_dead_root(struct btrfs_root *root) 731int btrfs_add_dead_root(struct btrfs_root *root)
705{ 732{
706 mutex_lock(&root->fs_info->trans_mutex); 733 spin_lock(&root->fs_info->trans_lock);
707 list_add(&root->root_list, &root->fs_info->dead_roots); 734 list_add(&root->root_list, &root->fs_info->dead_roots);
708 mutex_unlock(&root->fs_info->trans_mutex); 735 spin_unlock(&root->fs_info->trans_lock);
709 return 0; 736 return 0;
710} 737}
711 738
@@ -721,6 +748,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
721 int ret; 748 int ret;
722 int err = 0; 749 int err = 0;
723 750
751 spin_lock(&fs_info->fs_roots_radix_lock);
724 while (1) { 752 while (1) {
725 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, 753 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
726 (void **)gang, 0, 754 (void **)gang, 0,
@@ -733,6 +761,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
733 radix_tree_tag_clear(&fs_info->fs_roots_radix, 761 radix_tree_tag_clear(&fs_info->fs_roots_radix,
734 (unsigned long)root->root_key.objectid, 762 (unsigned long)root->root_key.objectid,
735 BTRFS_ROOT_TRANS_TAG); 763 BTRFS_ROOT_TRANS_TAG);
764 spin_unlock(&fs_info->fs_roots_radix_lock);
736 765
737 btrfs_free_log(trans, root); 766 btrfs_free_log(trans, root);
738 btrfs_update_reloc_root(trans, root); 767 btrfs_update_reloc_root(trans, root);
@@ -753,10 +782,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
753 err = btrfs_update_root(trans, fs_info->tree_root, 782 err = btrfs_update_root(trans, fs_info->tree_root,
754 &root->root_key, 783 &root->root_key,
755 &root->root_item); 784 &root->root_item);
785 spin_lock(&fs_info->fs_roots_radix_lock);
756 if (err) 786 if (err)
757 break; 787 break;
758 } 788 }
759 } 789 }
790 spin_unlock(&fs_info->fs_roots_radix_lock);
760 return err; 791 return err;
761} 792}
762 793
@@ -786,7 +817,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
786 btrfs_btree_balance_dirty(info->tree_root, nr); 817 btrfs_btree_balance_dirty(info->tree_root, nr);
787 cond_resched(); 818 cond_resched();
788 819
789 if (root->fs_info->closing || ret != -EAGAIN) 820 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
790 break; 821 break;
791 } 822 }
792 root->defrag_running = 0; 823 root->defrag_running = 0;
@@ -851,7 +882,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
851 parent = dget_parent(dentry); 882 parent = dget_parent(dentry);
852 parent_inode = parent->d_inode; 883 parent_inode = parent->d_inode;
853 parent_root = BTRFS_I(parent_inode)->root; 884 parent_root = BTRFS_I(parent_inode)->root;
854 record_root_in_trans(trans, parent_root); 885 btrfs_record_root_in_trans(trans, parent_root);
855 886
856 /* 887 /*
857 * insert the directory item 888 * insert the directory item
@@ -869,7 +900,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
869 ret = btrfs_update_inode(trans, parent_root, parent_inode); 900 ret = btrfs_update_inode(trans, parent_root, parent_inode);
870 BUG_ON(ret); 901 BUG_ON(ret);
871 902
872 record_root_in_trans(trans, root); 903 btrfs_record_root_in_trans(trans, root);
873 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 904 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
874 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 905 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
875 btrfs_check_and_init_root_item(new_root_item); 906 btrfs_check_and_init_root_item(new_root_item);
@@ -967,20 +998,20 @@ static void update_super_roots(struct btrfs_root *root)
967int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 998int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
968{ 999{
969 int ret = 0; 1000 int ret = 0;
970 spin_lock(&info->new_trans_lock); 1001 spin_lock(&info->trans_lock);
971 if (info->running_transaction) 1002 if (info->running_transaction)
972 ret = info->running_transaction->in_commit; 1003 ret = info->running_transaction->in_commit;
973 spin_unlock(&info->new_trans_lock); 1004 spin_unlock(&info->trans_lock);
974 return ret; 1005 return ret;
975} 1006}
976 1007
977int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1008int btrfs_transaction_blocked(struct btrfs_fs_info *info)
978{ 1009{
979 int ret = 0; 1010 int ret = 0;
980 spin_lock(&info->new_trans_lock); 1011 spin_lock(&info->trans_lock);
981 if (info->running_transaction) 1012 if (info->running_transaction)
982 ret = info->running_transaction->blocked; 1013 ret = info->running_transaction->blocked;
983 spin_unlock(&info->new_trans_lock); 1014 spin_unlock(&info->trans_lock);
984 return ret; 1015 return ret;
985} 1016}
986 1017
@@ -1004,9 +1035,7 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
1004 &wait); 1035 &wait);
1005 break; 1036 break;
1006 } 1037 }
1007 mutex_unlock(&root->fs_info->trans_mutex);
1008 schedule(); 1038 schedule();
1009 mutex_lock(&root->fs_info->trans_mutex);
1010 finish_wait(&root->fs_info->transaction_blocked_wait, &wait); 1039 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1011 } 1040 }
1012} 1041}
@@ -1032,9 +1061,7 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1032 &wait); 1061 &wait);
1033 break; 1062 break;
1034 } 1063 }
1035 mutex_unlock(&root->fs_info->trans_mutex);
1036 schedule(); 1064 schedule();
1037 mutex_lock(&root->fs_info->trans_mutex);
1038 finish_wait(&root->fs_info->transaction_wait, 1065 finish_wait(&root->fs_info->transaction_wait,
1039 &wait); 1066 &wait);
1040 } 1067 }
@@ -1072,7 +1099,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1072 1099
1073 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1100 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1074 ac->root = root; 1101 ac->root = root;
1075 ac->newtrans = btrfs_join_transaction(root, 0); 1102 ac->newtrans = btrfs_join_transaction(root);
1076 if (IS_ERR(ac->newtrans)) { 1103 if (IS_ERR(ac->newtrans)) {
1077 int err = PTR_ERR(ac->newtrans); 1104 int err = PTR_ERR(ac->newtrans);
1078 kfree(ac); 1105 kfree(ac);
@@ -1080,22 +1107,18 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1080 } 1107 }
1081 1108
1082 /* take transaction reference */ 1109 /* take transaction reference */
1083 mutex_lock(&root->fs_info->trans_mutex);
1084 cur_trans = trans->transaction; 1110 cur_trans = trans->transaction;
1085 atomic_inc(&cur_trans->use_count); 1111 atomic_inc(&cur_trans->use_count);
1086 mutex_unlock(&root->fs_info->trans_mutex);
1087 1112
1088 btrfs_end_transaction(trans, root); 1113 btrfs_end_transaction(trans, root);
1089 schedule_delayed_work(&ac->work, 0); 1114 schedule_delayed_work(&ac->work, 0);
1090 1115
1091 /* wait for transaction to start and unblock */ 1116 /* wait for transaction to start and unblock */
1092 mutex_lock(&root->fs_info->trans_mutex);
1093 if (wait_for_unblock) 1117 if (wait_for_unblock)
1094 wait_current_trans_commit_start_and_unblock(root, cur_trans); 1118 wait_current_trans_commit_start_and_unblock(root, cur_trans);
1095 else 1119 else
1096 wait_current_trans_commit_start(root, cur_trans); 1120 wait_current_trans_commit_start(root, cur_trans);
1097 put_transaction(cur_trans); 1121 put_transaction(cur_trans);
1098 mutex_unlock(&root->fs_info->trans_mutex);
1099 1122
1100 return 0; 1123 return 0;
1101} 1124}
@@ -1139,38 +1162,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1139 ret = btrfs_run_delayed_refs(trans, root, 0); 1162 ret = btrfs_run_delayed_refs(trans, root, 0);
1140 BUG_ON(ret); 1163 BUG_ON(ret);
1141 1164
1142 mutex_lock(&root->fs_info->trans_mutex); 1165 spin_lock(&cur_trans->commit_lock);
1143 if (cur_trans->in_commit) { 1166 if (cur_trans->in_commit) {
1167 spin_unlock(&cur_trans->commit_lock);
1144 atomic_inc(&cur_trans->use_count); 1168 atomic_inc(&cur_trans->use_count);
1145 mutex_unlock(&root->fs_info->trans_mutex);
1146 btrfs_end_transaction(trans, root); 1169 btrfs_end_transaction(trans, root);
1147 1170
1148 ret = wait_for_commit(root, cur_trans); 1171 ret = wait_for_commit(root, cur_trans);
1149 BUG_ON(ret); 1172 BUG_ON(ret);
1150 1173
1151 mutex_lock(&root->fs_info->trans_mutex);
1152 put_transaction(cur_trans); 1174 put_transaction(cur_trans);
1153 mutex_unlock(&root->fs_info->trans_mutex);
1154 1175
1155 return 0; 1176 return 0;
1156 } 1177 }
1157 1178
1158 trans->transaction->in_commit = 1; 1179 trans->transaction->in_commit = 1;
1159 trans->transaction->blocked = 1; 1180 trans->transaction->blocked = 1;
1181 spin_unlock(&cur_trans->commit_lock);
1160 wake_up(&root->fs_info->transaction_blocked_wait); 1182 wake_up(&root->fs_info->transaction_blocked_wait);
1161 1183
1184 spin_lock(&root->fs_info->trans_lock);
1162 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1185 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1163 prev_trans = list_entry(cur_trans->list.prev, 1186 prev_trans = list_entry(cur_trans->list.prev,
1164 struct btrfs_transaction, list); 1187 struct btrfs_transaction, list);
1165 if (!prev_trans->commit_done) { 1188 if (!prev_trans->commit_done) {
1166 atomic_inc(&prev_trans->use_count); 1189 atomic_inc(&prev_trans->use_count);
1167 mutex_unlock(&root->fs_info->trans_mutex); 1190 spin_unlock(&root->fs_info->trans_lock);
1168 1191
1169 wait_for_commit(root, prev_trans); 1192 wait_for_commit(root, prev_trans);
1170 1193
1171 mutex_lock(&root->fs_info->trans_mutex);
1172 put_transaction(prev_trans); 1194 put_transaction(prev_trans);
1195 } else {
1196 spin_unlock(&root->fs_info->trans_lock);
1173 } 1197 }
1198 } else {
1199 spin_unlock(&root->fs_info->trans_lock);
1174 } 1200 }
1175 1201
1176 if (now < cur_trans->start_time || now - cur_trans->start_time < 1) 1202 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1178,12 +1204,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1178 1204
1179 do { 1205 do {
1180 int snap_pending = 0; 1206 int snap_pending = 0;
1207
1181 joined = cur_trans->num_joined; 1208 joined = cur_trans->num_joined;
1182 if (!list_empty(&trans->transaction->pending_snapshots)) 1209 if (!list_empty(&trans->transaction->pending_snapshots))
1183 snap_pending = 1; 1210 snap_pending = 1;
1184 1211
1185 WARN_ON(cur_trans != trans->transaction); 1212 WARN_ON(cur_trans != trans->transaction);
1186 mutex_unlock(&root->fs_info->trans_mutex);
1187 1213
1188 if (flush_on_commit || snap_pending) { 1214 if (flush_on_commit || snap_pending) {
1189 btrfs_start_delalloc_inodes(root, 1); 1215 btrfs_start_delalloc_inodes(root, 1);
@@ -1206,14 +1232,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1206 prepare_to_wait(&cur_trans->writer_wait, &wait, 1232 prepare_to_wait(&cur_trans->writer_wait, &wait,
1207 TASK_UNINTERRUPTIBLE); 1233 TASK_UNINTERRUPTIBLE);
1208 1234
1209 smp_mb();
1210 if (atomic_read(&cur_trans->num_writers) > 1) 1235 if (atomic_read(&cur_trans->num_writers) > 1)
1211 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1236 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1212 else if (should_grow) 1237 else if (should_grow)
1213 schedule_timeout(1); 1238 schedule_timeout(1);
1214 1239
1215 mutex_lock(&root->fs_info->trans_mutex);
1216 finish_wait(&cur_trans->writer_wait, &wait); 1240 finish_wait(&cur_trans->writer_wait, &wait);
1241 spin_lock(&root->fs_info->trans_lock);
1242 root->fs_info->trans_no_join = 1;
1243 spin_unlock(&root->fs_info->trans_lock);
1217 } while (atomic_read(&cur_trans->num_writers) > 1 || 1244 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1218 (should_grow && cur_trans->num_joined != joined)); 1245 (should_grow && cur_trans->num_joined != joined));
1219 1246
@@ -1258,9 +1285,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1258 btrfs_prepare_extent_commit(trans, root); 1285 btrfs_prepare_extent_commit(trans, root);
1259 1286
1260 cur_trans = root->fs_info->running_transaction; 1287 cur_trans = root->fs_info->running_transaction;
1261 spin_lock(&root->fs_info->new_trans_lock);
1262 root->fs_info->running_transaction = NULL;
1263 spin_unlock(&root->fs_info->new_trans_lock);
1264 1288
1265 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1289 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1266 root->fs_info->tree_root->node); 1290 root->fs_info->tree_root->node);
@@ -1281,10 +1305,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1281 sizeof(root->fs_info->super_copy)); 1305 sizeof(root->fs_info->super_copy));
1282 1306
1283 trans->transaction->blocked = 0; 1307 trans->transaction->blocked = 0;
1308 spin_lock(&root->fs_info->trans_lock);
1309 root->fs_info->running_transaction = NULL;
1310 root->fs_info->trans_no_join = 0;
1311 spin_unlock(&root->fs_info->trans_lock);
1284 1312
1285 wake_up(&root->fs_info->transaction_wait); 1313 wake_up(&root->fs_info->transaction_wait);
1286 1314
1287 mutex_unlock(&root->fs_info->trans_mutex);
1288 ret = btrfs_write_and_wait_transaction(trans, root); 1315 ret = btrfs_write_and_wait_transaction(trans, root);
1289 BUG_ON(ret); 1316 BUG_ON(ret);
1290 write_ctree_super(trans, root, 0); 1317 write_ctree_super(trans, root, 0);
@@ -1297,22 +1324,21 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1297 1324
1298 btrfs_finish_extent_commit(trans, root); 1325 btrfs_finish_extent_commit(trans, root);
1299 1326
1300 mutex_lock(&root->fs_info->trans_mutex);
1301
1302 cur_trans->commit_done = 1; 1327 cur_trans->commit_done = 1;
1303 1328
1304 root->fs_info->last_trans_committed = cur_trans->transid; 1329 root->fs_info->last_trans_committed = cur_trans->transid;
1305 1330
1306 wake_up(&cur_trans->commit_wait); 1331 wake_up(&cur_trans->commit_wait);
1307 1332
1333 spin_lock(&root->fs_info->trans_lock);
1308 list_del_init(&cur_trans->list); 1334 list_del_init(&cur_trans->list);
1335 spin_unlock(&root->fs_info->trans_lock);
1336
1309 put_transaction(cur_trans); 1337 put_transaction(cur_trans);
1310 put_transaction(cur_trans); 1338 put_transaction(cur_trans);
1311 1339
1312 trace_btrfs_transaction_commit(root); 1340 trace_btrfs_transaction_commit(root);
1313 1341
1314 mutex_unlock(&root->fs_info->trans_mutex);
1315
1316 btrfs_scrub_continue(root); 1342 btrfs_scrub_continue(root);
1317 1343
1318 if (current->journal_info == trans) 1344 if (current->journal_info == trans)
@@ -1334,9 +1360,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1334 LIST_HEAD(list); 1360 LIST_HEAD(list);
1335 struct btrfs_fs_info *fs_info = root->fs_info; 1361 struct btrfs_fs_info *fs_info = root->fs_info;
1336 1362
1337 mutex_lock(&fs_info->trans_mutex); 1363 spin_lock(&fs_info->trans_lock);
1338 list_splice_init(&fs_info->dead_roots, &list); 1364 list_splice_init(&fs_info->dead_roots, &list);
1339 mutex_unlock(&fs_info->trans_mutex); 1365 spin_unlock(&fs_info->trans_lock);
1340 1366
1341 while (!list_empty(&list)) { 1367 while (!list_empty(&list)) {
1342 root = list_entry(list.next, struct btrfs_root, root_list); 1368 root = list_entry(list.next, struct btrfs_root, root_list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 804c88639e5d..02564e6230ac 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -28,10 +28,12 @@ struct btrfs_transaction {
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 atomic_t num_writers; 30 atomic_t num_writers;
31 atomic_t use_count;
31 32
32 unsigned long num_joined; 33 unsigned long num_joined;
34
35 spinlock_t commit_lock;
33 int in_commit; 36 int in_commit;
34 atomic_t use_count;
35 int commit_done; 37 int commit_done;
36 int blocked; 38 int blocked;
37 struct list_head list; 39 struct list_head list;
@@ -45,13 +47,14 @@ struct btrfs_transaction {
45 47
46struct btrfs_trans_handle { 48struct btrfs_trans_handle {
47 u64 transid; 49 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved; 50 u64 bytes_reserved;
51 unsigned long use_count;
50 unsigned long blocks_reserved; 52 unsigned long blocks_reserved;
51 unsigned long blocks_used; 53 unsigned long blocks_used;
52 unsigned long delayed_ref_updates; 54 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction; 55 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv; 56 struct btrfs_block_rsv *block_rsv;
57 struct btrfs_block_rsv *orig_rsv;
55}; 58};
56 59
57struct btrfs_pending_snapshot { 60struct btrfs_pending_snapshot {
@@ -66,19 +69,6 @@ struct btrfs_pending_snapshot {
66 struct list_head list; 69 struct list_head list;
67}; 70};
68 71
69static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
70 struct inode *inode)
71{
72 trans->block_group = BTRFS_I(inode)->block_group;
73}
74
75static inline void btrfs_update_inode_block_group(
76 struct btrfs_trans_handle *trans,
77 struct inode *inode)
78{
79 BTRFS_I(inode)->block_group = trans->block_group;
80}
81
82static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, 72static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
83 struct inode *inode) 73 struct inode *inode)
84{ 74{
@@ -92,12 +82,9 @@ int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
92 struct btrfs_root *root); 82 struct btrfs_root *root);
93struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 83struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
94 int num_items); 84 int num_items);
95struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 85struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
96 int num_blocks); 86struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
97struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root, 87struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
98 int num_blocks);
99struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
100 int num_blocks);
101int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); 88int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
102int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 89int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 90 struct btrfs_root *root);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c48214ef5c09..da541dfca2e3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -504,7 +504,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
504 BUG_ON(!new_device); 504 BUG_ON(!new_device);
505 memcpy(new_device, device, sizeof(*new_device)); 505 memcpy(new_device, device, sizeof(*new_device));
506 new_device->name = kstrdup(device->name, GFP_NOFS); 506 new_device->name = kstrdup(device->name, GFP_NOFS);
507 BUG_ON(!new_device->name); 507 BUG_ON(device->name && !new_device->name);
508 new_device->bdev = NULL; 508 new_device->bdev = NULL;
509 new_device->writeable = 0; 509 new_device->writeable = 0;
510 new_device->in_fs_metadata = 0; 510 new_device->in_fs_metadata = 0;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f3107e4b4d56..5366fe452ab0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
158 if (IS_ERR(trans)) 158 if (IS_ERR(trans))
159 return PTR_ERR(trans); 159 return PTR_ERR(trans);
160 160
161 btrfs_set_trans_block_group(trans, inode);
162
163 ret = do_setxattr(trans, inode, name, value, size, flags); 161 ret = do_setxattr(trans, inode, name, value, size, flags);
164 if (ret) 162 if (ret)
165 goto out; 163 goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 698c6b2cc462..49c9aada0374 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2382,6 +2382,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2382 ret = -EAGAIN; 2382 ret = -EAGAIN;
2383 goto out_unlock; 2383 goto out_unlock;
2384 } 2384 }
2385 wait_on_page_writeback(page);
2385 return 0; 2386 return 0;
2386out_unlock: 2387out_unlock:
2387 unlock_page(page); 2388 unlock_page(page);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 8f1700623b41..21de1d6d5849 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -74,8 +74,9 @@ shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
74 * Run idmap cache shrinker. 74 * Run idmap cache shrinker.
75 */ 75 */
76static int 76static int
77cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 77cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
78{ 78{
79 int nr_to_scan = sc->nr_to_scan;
79 int nr_del = 0; 80 int nr_del = 0;
80 int nr_rem = 0; 81 int nr_rem = 0;
81 struct rb_root *root; 82 struct rb_root *root;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a46126fd5735..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
336 int len = de->d_name.len; 336 int len = de->d_name.len;
337 int error; 337 int error;
338 338
339 dentry_unhash(de);
340
341 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 339 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
342 if (!error) { 340 if (!error) {
343 /* VFS may delete the child */ 341 /* VFS may delete the child */
@@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
361 int new_length = new_dentry->d_name.len; 359 int new_length = new_dentry->d_name.len;
362 int error; 360 int error;
363 361
364 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
365 dentry_unhash(new_dentry);
366
367 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 362 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
368 coda_i2f(new_dir), old_length, new_length, 363 coda_i2f(new_dir), old_length, new_length,
369 (const char *) old_name, (const char *)new_name); 364 (const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9d17d350abc5..9a37a9b6de3a 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1359 struct module *subsys_owner = NULL, *dead_item_owner = NULL; 1359 struct module *subsys_owner = NULL, *dead_item_owner = NULL;
1360 int ret; 1360 int ret;
1361 1361
1362 dentry_unhash(dentry);
1363
1364 if (dentry->d_parent == configfs_sb->s_root) 1362 if (dentry->d_parent == configfs_sb->s_root)
1365 return -EPERM; 1363 return -EPERM;
1366 1364
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b8d5c8091024..58609bde3b9f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1024,25 +1024,25 @@ out:
1024} 1024}
1025 1025
1026/** 1026/**
1027 * contains_ecryptfs_marker - check for the ecryptfs marker 1027 * ecryptfs_validate_marker - check for the ecryptfs marker
1028 * @data: The data block in which to check 1028 * @data: The data block in which to check
1029 * 1029 *
1030 * Returns one if marker found; zero if not found 1030 * Returns zero if marker found; -EINVAL if not found
1031 */ 1031 */
1032static int contains_ecryptfs_marker(char *data) 1032static int ecryptfs_validate_marker(char *data)
1033{ 1033{
1034 u32 m_1, m_2; 1034 u32 m_1, m_2;
1035 1035
1036 m_1 = get_unaligned_be32(data); 1036 m_1 = get_unaligned_be32(data);
1037 m_2 = get_unaligned_be32(data + 4); 1037 m_2 = get_unaligned_be32(data + 4);
1038 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) 1038 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
1039 return 1; 1039 return 0;
1040 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " 1040 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
1041 "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2, 1041 "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
1042 MAGIC_ECRYPTFS_MARKER); 1042 MAGIC_ECRYPTFS_MARKER);
1043 ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = " 1043 ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
1044 "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER)); 1044 "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
1045 return 0; 1045 return -EINVAL;
1046} 1046}
1047 1047
1048struct ecryptfs_flag_map_elem { 1048struct ecryptfs_flag_map_elem {
@@ -1201,27 +1201,19 @@ int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code)
1201 return rc; 1201 return rc;
1202} 1202}
1203 1203
1204int ecryptfs_read_and_validate_header_region(char *data, 1204int ecryptfs_read_and_validate_header_region(struct inode *inode)
1205 struct inode *ecryptfs_inode)
1206{ 1205{
1207 struct ecryptfs_crypt_stat *crypt_stat = 1206 u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
1208 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 1207 u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
1209 int rc; 1208 int rc;
1210 1209
1211 if (crypt_stat->extent_size == 0) 1210 rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
1212 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; 1211 inode);
1213 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1212 if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
1214 ecryptfs_inode); 1213 return rc >= 0 ? -EINVAL : rc;
1215 if (rc < 0) { 1214 rc = ecryptfs_validate_marker(marker);
1216 printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", 1215 if (!rc)
1217 __func__, rc); 1216 ecryptfs_i_size_init(file_size, inode);
1218 goto out;
1219 }
1220 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1221 rc = -EINVAL;
1222 } else
1223 rc = 0;
1224out:
1225 return rc; 1217 return rc;
1226} 1218}
1227 1219
@@ -1242,8 +1234,7 @@ ecryptfs_write_header_metadata(char *virt,
1242 (*written) = 6; 1234 (*written) = 6;
1243} 1235}
1244 1236
1245struct kmem_cache *ecryptfs_header_cache_1; 1237struct kmem_cache *ecryptfs_header_cache;
1246struct kmem_cache *ecryptfs_header_cache_2;
1247 1238
1248/** 1239/**
1249 * ecryptfs_write_headers_virt 1240 * ecryptfs_write_headers_virt
@@ -1496,11 +1487,9 @@ static int ecryptfs_read_headers_virt(char *page_virt,
1496 crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private( 1487 crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
1497 ecryptfs_dentry->d_sb)->mount_crypt_stat; 1488 ecryptfs_dentry->d_sb)->mount_crypt_stat;
1498 offset = ECRYPTFS_FILE_SIZE_BYTES; 1489 offset = ECRYPTFS_FILE_SIZE_BYTES;
1499 rc = contains_ecryptfs_marker(page_virt + offset); 1490 rc = ecryptfs_validate_marker(page_virt + offset);
1500 if (rc == 0) { 1491 if (rc)
1501 rc = -EINVAL;
1502 goto out; 1492 goto out;
1503 }
1504 if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) 1493 if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
1505 ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); 1494 ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
1506 offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1495 offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
@@ -1567,20 +1556,21 @@ out:
1567 return rc; 1556 return rc;
1568} 1557}
1569 1558
1570int ecryptfs_read_and_validate_xattr_region(char *page_virt, 1559int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
1571 struct dentry *ecryptfs_dentry) 1560 struct inode *inode)
1572{ 1561{
1562 u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
1563 u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
1573 int rc; 1564 int rc;
1574 1565
1575 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry->d_inode); 1566 rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
1576 if (rc) 1567 ECRYPTFS_XATTR_NAME, file_size,
1577 goto out; 1568 ECRYPTFS_SIZE_AND_MARKER_BYTES);
1578 if (!contains_ecryptfs_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES)) { 1569 if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
1579 printk(KERN_WARNING "Valid data found in [%s] xattr, but " 1570 return rc >= 0 ? -EINVAL : rc;
1580 "the marker is invalid\n", ECRYPTFS_XATTR_NAME); 1571 rc = ecryptfs_validate_marker(marker);
1581 rc = -EINVAL; 1572 if (!rc)
1582 } 1573 ecryptfs_i_size_init(file_size, inode);
1583out:
1584 return rc; 1574 return rc;
1585} 1575}
1586 1576
@@ -1610,7 +1600,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1610 ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, 1600 ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
1611 mount_crypt_stat); 1601 mount_crypt_stat);
1612 /* Read the first page from the underlying file */ 1602 /* Read the first page from the underlying file */
1613 page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER); 1603 page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
1614 if (!page_virt) { 1604 if (!page_virt) {
1615 rc = -ENOMEM; 1605 rc = -ENOMEM;
1616 printk(KERN_ERR "%s: Unable to allocate page_virt\n", 1606 printk(KERN_ERR "%s: Unable to allocate page_virt\n",
@@ -1655,7 +1645,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1655out: 1645out:
1656 if (page_virt) { 1646 if (page_virt) {
1657 memset(page_virt, 0, PAGE_CACHE_SIZE); 1647 memset(page_virt, 0, PAGE_CACHE_SIZE);
1658 kmem_cache_free(ecryptfs_header_cache_1, page_virt); 1648 kmem_cache_free(ecryptfs_header_cache, page_virt);
1659 } 1649 }
1660 return rc; 1650 return rc;
1661} 1651}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e70282775e2c..43c7c43b06f5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -200,6 +200,8 @@ ecryptfs_get_key_payload_data(struct key *key)
200#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 200#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
201#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ 201#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */
202#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64)) 202#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64))
203#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \
204 + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
203#define ECRYPTFS_DEFAULT_CIPHER "aes" 205#define ECRYPTFS_DEFAULT_CIPHER "aes"
204#define ECRYPTFS_DEFAULT_KEY_BYTES 16 206#define ECRYPTFS_DEFAULT_KEY_BYTES 16
205#define ECRYPTFS_DEFAULT_HASH "md5" 207#define ECRYPTFS_DEFAULT_HASH "md5"
@@ -603,8 +605,7 @@ extern struct kmem_cache *ecryptfs_file_info_cache;
603extern struct kmem_cache *ecryptfs_dentry_info_cache; 605extern struct kmem_cache *ecryptfs_dentry_info_cache;
604extern struct kmem_cache *ecryptfs_inode_info_cache; 606extern struct kmem_cache *ecryptfs_inode_info_cache;
605extern struct kmem_cache *ecryptfs_sb_info_cache; 607extern struct kmem_cache *ecryptfs_sb_info_cache;
606extern struct kmem_cache *ecryptfs_header_cache_1; 608extern struct kmem_cache *ecryptfs_header_cache;
607extern struct kmem_cache *ecryptfs_header_cache_2;
608extern struct kmem_cache *ecryptfs_xattr_cache; 609extern struct kmem_cache *ecryptfs_xattr_cache;
609extern struct kmem_cache *ecryptfs_key_record_cache; 610extern struct kmem_cache *ecryptfs_key_record_cache;
610extern struct kmem_cache *ecryptfs_key_sig_cache; 611extern struct kmem_cache *ecryptfs_key_sig_cache;
@@ -625,14 +626,9 @@ struct ecryptfs_open_req {
625 struct list_head kthread_ctl_list; 626 struct list_head kthread_ctl_list;
626}; 627};
627 628
628#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001 629struct inode *ecryptfs_get_inode(struct inode *lower_inode,
629int ecryptfs_interpose(struct dentry *hidden_dentry, 630 struct super_block *sb);
630 struct dentry *this_dentry, struct super_block *sb,
631 u32 flags);
632void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); 631void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
634 struct dentry *lower_dentry,
635 struct inode *ecryptfs_dir_inode);
636int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 632int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
637 size_t *decrypted_name_size, 633 size_t *decrypted_name_size,
638 struct dentry *ecryptfs_dentry, 634 struct dentry *ecryptfs_dentry,
@@ -664,10 +660,9 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt, 660void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat, 661 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written); 662 size_t *written);
667int ecryptfs_read_and_validate_header_region(char *data, 663int ecryptfs_read_and_validate_header_region(struct inode *inode);
668 struct inode *ecryptfs_inode); 664int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
669int ecryptfs_read_and_validate_xattr_region(char *page_virt, 665 struct inode *inode);
670 struct dentry *ecryptfs_dentry);
671u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); 666u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
672int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); 667int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
673void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); 668void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
@@ -679,9 +674,6 @@ int
679ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, 674ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
680 unsigned char *src, struct dentry *ecryptfs_dentry); 675 unsigned char *src, struct dentry *ecryptfs_dentry);
681int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); 676int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
682int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
683int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
684void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
685ssize_t 677ssize_t
686ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, 678ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
687 void *value, size_t size); 679 void *value, size_t size);
@@ -761,7 +753,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
761 struct dentry *lower_dentry, 753 struct dentry *lower_dentry,
762 struct vfsmount *lower_mnt, 754 struct vfsmount *lower_mnt,
763 const struct cred *cred); 755 const struct cred *cred);
764int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry); 756int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode);
765void ecryptfs_put_lower_file(struct inode *inode); 757void ecryptfs_put_lower_file(struct inode *inode);
766int 758int
767ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, 759ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 566e5472f78c..4ec9eb00a241 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,7 +191,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
191 | ECRYPTFS_ENCRYPTED); 191 | ECRYPTFS_ENCRYPTED);
192 } 192 }
193 mutex_unlock(&crypt_stat->cs_mutex); 193 mutex_unlock(&crypt_stat->cs_mutex);
194 rc = ecryptfs_get_lower_file(ecryptfs_dentry); 194 rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
195 if (rc) { 195 if (rc) {
196 printk(KERN_ERR "%s: Error attempting to initialize " 196 printk(KERN_ERR "%s: Error attempting to initialize "
197 "the lower file for the dentry with name " 197 "the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bc116b9ffcf2..7349ade17de6 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -51,6 +51,97 @@ static void unlock_dir(struct dentry *dir)
51 dput(dir); 51 dput(dir);
52} 52}
53 53
54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
55{
56 if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
57 return 1;
58 return 0;
59}
60
61static int ecryptfs_inode_set(struct inode *inode, void *opaque)
62{
63 struct inode *lower_inode = opaque;
64
65 ecryptfs_set_inode_lower(inode, lower_inode);
66 fsstack_copy_attr_all(inode, lower_inode);
67 /* i_size will be overwritten for encrypted regular files */
68 fsstack_copy_inode_size(inode, lower_inode);
69 inode->i_ino = lower_inode->i_ino;
70 inode->i_version++;
71 inode->i_mapping->a_ops = &ecryptfs_aops;
72
73 if (S_ISLNK(inode->i_mode))
74 inode->i_op = &ecryptfs_symlink_iops;
75 else if (S_ISDIR(inode->i_mode))
76 inode->i_op = &ecryptfs_dir_iops;
77 else
78 inode->i_op = &ecryptfs_main_iops;
79
80 if (S_ISDIR(inode->i_mode))
81 inode->i_fop = &ecryptfs_dir_fops;
82 else if (special_file(inode->i_mode))
83 init_special_inode(inode, inode->i_mode, inode->i_rdev);
84 else
85 inode->i_fop = &ecryptfs_main_fops;
86
87 return 0;
88}
89
90static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
91 struct super_block *sb)
92{
93 struct inode *inode;
94
95 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
96 return ERR_PTR(-EXDEV);
97 if (!igrab(lower_inode))
98 return ERR_PTR(-ESTALE);
99 inode = iget5_locked(sb, (unsigned long)lower_inode,
100 ecryptfs_inode_test, ecryptfs_inode_set,
101 lower_inode);
102 if (!inode) {
103 iput(lower_inode);
104 return ERR_PTR(-EACCES);
105 }
106 if (!(inode->i_state & I_NEW))
107 iput(lower_inode);
108
109 return inode;
110}
111
112struct inode *ecryptfs_get_inode(struct inode *lower_inode,
113 struct super_block *sb)
114{
115 struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
116
117 if (!IS_ERR(inode) && (inode->i_state & I_NEW))
118 unlock_new_inode(inode);
119
120 return inode;
121}
122
123/**
124 * ecryptfs_interpose
125 * @lower_dentry: Existing dentry in the lower filesystem
126 * @dentry: ecryptfs' dentry
127 * @sb: ecryptfs's super_block
128 *
129 * Interposes upper and lower dentries.
130 *
131 * Returns zero on success; non-zero otherwise
132 */
133static int ecryptfs_interpose(struct dentry *lower_dentry,
134 struct dentry *dentry, struct super_block *sb)
135{
136 struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb);
137
138 if (IS_ERR(inode))
139 return PTR_ERR(inode);
140 d_instantiate(dentry, inode);
141
142 return 0;
143}
144
54/** 145/**
55 * ecryptfs_create_underlying_file 146 * ecryptfs_create_underlying_file
56 * @lower_dir_inode: inode of the parent in the lower fs of the new file 147 * @lower_dir_inode: inode of the parent in the lower fs of the new file
@@ -129,7 +220,7 @@ ecryptfs_do_create(struct inode *directory_inode,
129 goto out_lock; 220 goto out_lock;
130 } 221 }
131 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 222 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
132 directory_inode->i_sb, 0); 223 directory_inode->i_sb);
133 if (rc) { 224 if (rc) {
134 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); 225 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
135 goto out_lock; 226 goto out_lock;
@@ -168,7 +259,8 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
168 "context; rc = [%d]\n", rc); 259 "context; rc = [%d]\n", rc);
169 goto out; 260 goto out;
170 } 261 }
171 rc = ecryptfs_get_lower_file(ecryptfs_dentry); 262 rc = ecryptfs_get_lower_file(ecryptfs_dentry,
263 ecryptfs_dentry->d_inode);
172 if (rc) { 264 if (rc) {
173 printk(KERN_ERR "%s: Error attempting to initialize " 265 printk(KERN_ERR "%s: Error attempting to initialize "
174 "the lower file for the dentry with name " 266 "the lower file for the dentry with name "
@@ -215,102 +307,90 @@ out:
215 return rc; 307 return rc;
216} 308}
217 309
310static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
311{
312 struct ecryptfs_crypt_stat *crypt_stat;
313 int rc;
314
315 rc = ecryptfs_get_lower_file(dentry, inode);
316 if (rc) {
317 printk(KERN_ERR "%s: Error attempting to initialize "
318 "the lower file for the dentry with name "
319 "[%s]; rc = [%d]\n", __func__,
320 dentry->d_name.name, rc);
321 return rc;
322 }
323
324 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
325 /* TODO: lock for crypt_stat comparison */
326 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
327 ecryptfs_set_default_sizes(crypt_stat);
328
329 rc = ecryptfs_read_and_validate_header_region(inode);
330 ecryptfs_put_lower_file(inode);
331 if (rc) {
332 rc = ecryptfs_read_and_validate_xattr_region(dentry, inode);
333 if (!rc)
334 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
335 }
336
337 /* Must return 0 to allow non-eCryptfs files to be looked up, too */
338 return 0;
339}
340
218/** 341/**
219 * ecryptfs_lookup_and_interpose_lower - Perform a lookup 342 * ecryptfs_lookup_interpose - Dentry interposition for a lookup
220 */ 343 */
221int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 344static int ecryptfs_lookup_interpose(struct dentry *dentry,
222 struct dentry *lower_dentry, 345 struct dentry *lower_dentry,
223 struct inode *ecryptfs_dir_inode) 346 struct inode *dir_inode)
224{ 347{
225 struct dentry *lower_dir_dentry; 348 struct inode *inode, *lower_inode = lower_dentry->d_inode;
349 struct ecryptfs_dentry_info *dentry_info;
226 struct vfsmount *lower_mnt; 350 struct vfsmount *lower_mnt;
227 struct inode *lower_inode; 351 int rc = 0;
228 struct ecryptfs_crypt_stat *crypt_stat; 352
229 char *page_virt = NULL; 353 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
230 int put_lower = 0, rc = 0; 354 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
231
232 lower_dir_dentry = lower_dentry->d_parent;
233 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
234 ecryptfs_dentry->d_parent));
235 lower_inode = lower_dentry->d_inode;
236 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
237 BUG_ON(!lower_dentry->d_count); 355 BUG_ON(!lower_dentry->d_count);
238 ecryptfs_set_dentry_private(ecryptfs_dentry, 356
239 kmem_cache_alloc(ecryptfs_dentry_info_cache, 357 dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
240 GFP_KERNEL)); 358 ecryptfs_set_dentry_private(dentry, dentry_info);
241 if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { 359 if (!dentry_info) {
242 rc = -ENOMEM;
243 printk(KERN_ERR "%s: Out of memory whilst attempting " 360 printk(KERN_ERR "%s: Out of memory whilst attempting "
244 "to allocate ecryptfs_dentry_info struct\n", 361 "to allocate ecryptfs_dentry_info struct\n",
245 __func__); 362 __func__);
246 goto out_put; 363 dput(lower_dentry);
364 mntput(lower_mnt);
365 d_drop(dentry);
366 return -ENOMEM;
247 } 367 }
248 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); 368 ecryptfs_set_dentry_lower(dentry, lower_dentry);
249 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); 369 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
370
250 if (!lower_dentry->d_inode) { 371 if (!lower_dentry->d_inode) {
251 /* We want to add because we couldn't find in lower */ 372 /* We want to add because we couldn't find in lower */
252 d_add(ecryptfs_dentry, NULL); 373 d_add(dentry, NULL);
253 goto out; 374 return 0;
254 }
255 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
256 ecryptfs_dir_inode->i_sb,
257 ECRYPTFS_INTERPOSE_FLAG_D_ADD);
258 if (rc) {
259 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
260 __func__, rc);
261 goto out;
262 }
263 if (S_ISDIR(lower_inode->i_mode))
264 goto out;
265 if (S_ISLNK(lower_inode->i_mode))
266 goto out;
267 if (special_file(lower_inode->i_mode))
268 goto out;
269 /* Released in this function */
270 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
271 if (!page_virt) {
272 printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
273 __func__);
274 rc = -ENOMEM;
275 goto out;
276 } 375 }
277 rc = ecryptfs_get_lower_file(ecryptfs_dentry); 376 inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
278 if (rc) { 377 if (IS_ERR(inode)) {
279 printk(KERN_ERR "%s: Error attempting to initialize " 378 printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
280 "the lower file for the dentry with name " 379 __func__, PTR_ERR(inode));
281 "[%s]; rc = [%d]\n", __func__, 380 return PTR_ERR(inode);
282 ecryptfs_dentry->d_name.name, rc);
283 goto out_free_kmem;
284 } 381 }
285 put_lower = 1; 382 if (S_ISREG(inode->i_mode)) {
286 crypt_stat = &ecryptfs_inode_to_private( 383 rc = ecryptfs_i_size_read(dentry, inode);
287 ecryptfs_dentry->d_inode)->crypt_stat;
288 /* TODO: lock for crypt_stat comparison */
289 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
290 ecryptfs_set_default_sizes(crypt_stat);
291 rc = ecryptfs_read_and_validate_header_region(page_virt,
292 ecryptfs_dentry->d_inode);
293 if (rc) {
294 memset(page_virt, 0, PAGE_CACHE_SIZE);
295 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
296 ecryptfs_dentry);
297 if (rc) { 384 if (rc) {
298 rc = 0; 385 make_bad_inode(inode);
299 goto out_free_kmem; 386 return rc;
300 } 387 }
301 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
302 } 388 }
303 ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); 389
304out_free_kmem: 390 if (inode->i_state & I_NEW)
305 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 391 unlock_new_inode(inode);
306 goto out; 392 d_add(dentry, inode);
307out_put: 393
308 dput(lower_dentry);
309 mntput(lower_mnt);
310 d_drop(ecryptfs_dentry);
311out:
312 if (put_lower)
313 ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
314 return rc; 394 return rc;
315} 395}
316 396
@@ -353,12 +433,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
353 goto out_d_drop; 433 goto out_d_drop;
354 } 434 }
355 if (lower_dentry->d_inode) 435 if (lower_dentry->d_inode)
356 goto lookup_and_interpose; 436 goto interpose;
357 mount_crypt_stat = &ecryptfs_superblock_to_private( 437 mount_crypt_stat = &ecryptfs_superblock_to_private(
358 ecryptfs_dentry->d_sb)->mount_crypt_stat; 438 ecryptfs_dentry->d_sb)->mount_crypt_stat;
359 if (!(mount_crypt_stat 439 if (!(mount_crypt_stat
360 && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) 440 && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
361 goto lookup_and_interpose; 441 goto interpose;
362 dput(lower_dentry); 442 dput(lower_dentry);
363 rc = ecryptfs_encrypt_and_encode_filename( 443 rc = ecryptfs_encrypt_and_encode_filename(
364 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, 444 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
@@ -381,9 +461,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
381 encrypted_and_encoded_name); 461 encrypted_and_encoded_name);
382 goto out_d_drop; 462 goto out_d_drop;
383 } 463 }
384lookup_and_interpose: 464interpose:
385 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 465 rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
386 ecryptfs_dir_inode); 466 ecryptfs_dir_inode);
387 goto out; 467 goto out;
388out_d_drop: 468out_d_drop:
389 d_drop(ecryptfs_dentry); 469 d_drop(ecryptfs_dentry);
@@ -411,7 +491,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
411 lower_new_dentry); 491 lower_new_dentry);
412 if (rc || !lower_new_dentry->d_inode) 492 if (rc || !lower_new_dentry->d_inode)
413 goto out_lock; 493 goto out_lock;
414 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 494 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
415 if (rc) 495 if (rc)
416 goto out_lock; 496 goto out_lock;
417 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 497 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -478,7 +558,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
478 kfree(encoded_symname); 558 kfree(encoded_symname);
479 if (rc || !lower_dentry->d_inode) 559 if (rc || !lower_dentry->d_inode)
480 goto out_lock; 560 goto out_lock;
481 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); 561 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
482 if (rc) 562 if (rc)
483 goto out_lock; 563 goto out_lock;
484 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 564 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -502,7 +582,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
502 rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); 582 rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
503 if (rc || !lower_dentry->d_inode) 583 if (rc || !lower_dentry->d_inode)
504 goto out; 584 goto out;
505 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); 585 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
506 if (rc) 586 if (rc)
507 goto out; 587 goto out;
508 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 588 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -521,8 +601,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
521 struct dentry *lower_dir_dentry; 601 struct dentry *lower_dir_dentry;
522 int rc; 602 int rc;
523 603
524 dentry_unhash(dentry);
525
526 lower_dentry = ecryptfs_dentry_to_lower(dentry); 604 lower_dentry = ecryptfs_dentry_to_lower(dentry);
527 dget(dentry); 605 dget(dentry);
528 lower_dir_dentry = lock_parent(lower_dentry); 606 lower_dir_dentry = lock_parent(lower_dentry);
@@ -552,7 +630,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
552 rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); 630 rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
553 if (rc || !lower_dentry->d_inode) 631 if (rc || !lower_dentry->d_inode)
554 goto out; 632 goto out;
555 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); 633 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
556 if (rc) 634 if (rc)
557 goto out; 635 goto out;
558 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 636 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -575,9 +653,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
575 struct dentry *lower_new_dir_dentry; 653 struct dentry *lower_new_dir_dentry;
576 struct dentry *trap = NULL; 654 struct dentry *trap = NULL;
577 655
578 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
579 dentry_unhash(new_dentry);
580
581 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); 656 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
582 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); 657 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
583 dget(lower_old_dentry); 658 dget(lower_old_dentry);
@@ -755,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
755 lower_ia->ia_valid &= ~ATTR_SIZE; 830 lower_ia->ia_valid &= ~ATTR_SIZE;
756 return 0; 831 return 0;
757 } 832 }
758 rc = ecryptfs_get_lower_file(dentry); 833 rc = ecryptfs_get_lower_file(dentry, inode);
759 if (rc) 834 if (rc)
760 return rc; 835 return rc;
761 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 836 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
@@ -911,7 +986,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
911 986
912 mount_crypt_stat = &ecryptfs_superblock_to_private( 987 mount_crypt_stat = &ecryptfs_superblock_to_private(
913 dentry->d_sb)->mount_crypt_stat; 988 dentry->d_sb)->mount_crypt_stat;
914 rc = ecryptfs_get_lower_file(dentry); 989 rc = ecryptfs_get_lower_file(dentry, inode);
915 if (rc) { 990 if (rc) {
916 mutex_unlock(&crypt_stat->cs_mutex); 991 mutex_unlock(&crypt_stat->cs_mutex);
917 goto out; 992 goto out;
@@ -1084,21 +1159,6 @@ out:
1084 return rc; 1159 return rc;
1085} 1160}
1086 1161
1087int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
1088{
1089 if ((ecryptfs_inode_to_lower(inode)
1090 == (struct inode *)candidate_lower_inode))
1091 return 1;
1092 else
1093 return 0;
1094}
1095
1096int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
1097{
1098 ecryptfs_init_inode(inode, (struct inode *)lower_inode);
1099 return 0;
1100}
1101
1102const struct inode_operations ecryptfs_symlink_iops = { 1162const struct inode_operations ecryptfs_symlink_iops = {
1103 .readlink = ecryptfs_readlink, 1163 .readlink = ecryptfs_readlink,
1104 .follow_link = ecryptfs_follow_link, 1164 .follow_link = ecryptfs_follow_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 89b93389af8e..9f1bb747d77d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -135,12 +135,12 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
135 return rc; 135 return rc;
136} 136}
137 137
138int ecryptfs_get_lower_file(struct dentry *dentry) 138int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode)
139{ 139{
140 struct ecryptfs_inode_info *inode_info = 140 struct ecryptfs_inode_info *inode_info;
141 ecryptfs_inode_to_private(dentry->d_inode);
142 int count, rc = 0; 141 int count, rc = 0;
143 142
143 inode_info = ecryptfs_inode_to_private(inode);
144 mutex_lock(&inode_info->lower_file_mutex); 144 mutex_lock(&inode_info->lower_file_mutex);
145 count = atomic_inc_return(&inode_info->lower_file_count); 145 count = atomic_inc_return(&inode_info->lower_file_count);
146 if (WARN_ON_ONCE(count < 1)) 146 if (WARN_ON_ONCE(count < 1))
@@ -168,75 +168,6 @@ void ecryptfs_put_lower_file(struct inode *inode)
168 } 168 }
169} 169}
170 170
171static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
172 struct super_block *sb)
173{
174 struct inode *inode;
175 int rc = 0;
176
177 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
178 rc = -EXDEV;
179 goto out;
180 }
181 if (!igrab(lower_inode)) {
182 rc = -ESTALE;
183 goto out;
184 }
185 inode = iget5_locked(sb, (unsigned long)lower_inode,
186 ecryptfs_inode_test, ecryptfs_inode_set,
187 lower_inode);
188 if (!inode) {
189 rc = -EACCES;
190 iput(lower_inode);
191 goto out;
192 }
193 if (inode->i_state & I_NEW)
194 unlock_new_inode(inode);
195 else
196 iput(lower_inode);
197 if (S_ISLNK(lower_inode->i_mode))
198 inode->i_op = &ecryptfs_symlink_iops;
199 else if (S_ISDIR(lower_inode->i_mode))
200 inode->i_op = &ecryptfs_dir_iops;
201 if (S_ISDIR(lower_inode->i_mode))
202 inode->i_fop = &ecryptfs_dir_fops;
203 if (special_file(lower_inode->i_mode))
204 init_special_inode(inode, lower_inode->i_mode,
205 lower_inode->i_rdev);
206 fsstack_copy_attr_all(inode, lower_inode);
207 /* This size will be overwritten for real files w/ headers and
208 * other metadata */
209 fsstack_copy_inode_size(inode, lower_inode);
210 return inode;
211out:
212 return ERR_PTR(rc);
213}
214
215/**
216 * ecryptfs_interpose
217 * @lower_dentry: Existing dentry in the lower filesystem
218 * @dentry: ecryptfs' dentry
219 * @sb: ecryptfs's super_block
220 * @flags: flags to govern behavior of interpose procedure
221 *
222 * Interposes upper and lower dentries.
223 *
224 * Returns zero on success; non-zero otherwise
225 */
226int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
227 struct super_block *sb, u32 flags)
228{
229 struct inode *lower_inode = lower_dentry->d_inode;
230 struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
231 if (IS_ERR(inode))
232 return PTR_ERR(inode);
233 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
234 d_add(dentry, inode);
235 else
236 d_instantiate(dentry, inode);
237 return 0;
238}
239
240enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, 171enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
241 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, 172 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
242 ecryptfs_opt_ecryptfs_key_bytes, 173 ecryptfs_opt_ecryptfs_key_bytes,
@@ -704,13 +635,8 @@ static struct ecryptfs_cache_info {
704 .size = sizeof(struct ecryptfs_sb_info), 635 .size = sizeof(struct ecryptfs_sb_info),
705 }, 636 },
706 { 637 {
707 .cache = &ecryptfs_header_cache_1, 638 .cache = &ecryptfs_header_cache,
708 .name = "ecryptfs_headers_1", 639 .name = "ecryptfs_headers",
709 .size = PAGE_CACHE_SIZE,
710 },
711 {
712 .cache = &ecryptfs_header_cache_2,
713 .name = "ecryptfs_headers_2",
714 .size = PAGE_CACHE_SIZE, 640 .size = PAGE_CACHE_SIZE,
715 }, 641 },
716 { 642 {
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 245b517bf1b6..dbd52d40df4c 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -93,22 +93,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
93} 93}
94 94
95/** 95/**
96 * ecryptfs_init_inode
97 * @inode: The ecryptfs inode
98 *
99 * Set up the ecryptfs inode.
100 */
101void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
102{
103 ecryptfs_set_inode_lower(inode, lower_inode);
104 inode->i_ino = lower_inode->i_ino;
105 inode->i_version++;
106 inode->i_op = &ecryptfs_main_iops;
107 inode->i_fop = &ecryptfs_main_fops;
108 inode->i_mapping->a_ops = &ecryptfs_aops;
109}
110
111/**
112 * ecryptfs_statfs 96 * ecryptfs_statfs
113 * @sb: The ecryptfs super block 97 * @sb: The ecryptfs super block
114 * @buf: The struct kstatfs to fill in with stats 98 * @buf: The struct kstatfs to fill in with stats
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 68b2e43d7c35..3451d23c3bae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3392,7 +3392,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3392 * so would cause a commit on atime updates, which we don't bother doing. 3392 * so would cause a commit on atime updates, which we don't bother doing.
3393 * We handle synchronous inodes at the highest possible level. 3393 * We handle synchronous inodes at the highest possible level.
3394 */ 3394 */
3395void ext3_dirty_inode(struct inode *inode) 3395void ext3_dirty_inode(struct inode *inode, int flags)
3396{ 3396{
3397 handle_t *current_handle = ext3_journal_current_handle(); 3397 handle_t *current_handle = ext3_journal_current_handle();
3398 handle_t *handle; 3398 handle_t *handle;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a74b89c09f90..1921392cd708 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1813,7 +1813,7 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1813extern void ext4_evict_inode(struct inode *); 1813extern void ext4_evict_inode(struct inode *);
1814extern void ext4_clear_inode(struct inode *); 1814extern void ext4_clear_inode(struct inode *);
1815extern int ext4_sync_inode(handle_t *, struct inode *); 1815extern int ext4_sync_inode(handle_t *, struct inode *);
1816extern void ext4_dirty_inode(struct inode *); 1816extern void ext4_dirty_inode(struct inode *, int);
1817extern int ext4_change_inode_journal_flag(struct inode *, int); 1817extern int ext4_change_inode_journal_flag(struct inode *, int);
1818extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1818extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1819extern int ext4_can_truncate(struct inode *inode); 1819extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50d0e9c64584..a5763e3505ba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5733,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5733 * so would cause a commit on atime updates, which we don't bother doing. 5733 * so would cause a commit on atime updates, which we don't bother doing.
5734 * We handle synchronous inodes at the highest possible level. 5734 * We handle synchronous inodes at the highest possible level.
5735 */ 5735 */
5736void ext4_dirty_inode(struct inode *inode) 5736void ext4_dirty_inode(struct inode *inode, int flags)
5737{ 5737{
5738 handle_t *handle; 5738 handle_t *handle;
5739 5739
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index be15437c272e..3b222dafd15b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326 struct fat_slot_info sinfo; 326 struct fat_slot_info sinfo;
327 int err; 327 int err;
328 328
329 dentry_unhash(dentry);
330
331 lock_super(sb); 329 lock_super(sb);
332 /* 330 /*
333 * Check whether the directory is not in use, then check 331 * Check whether the directory is not in use, then check
@@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
459 old_inode = old_dentry->d_inode; 457 old_inode = old_dentry->d_inode;
460 new_inode = new_dentry->d_inode; 458 new_inode = new_dentry->d_inode;
461 459
462 if (new_inode && S_ISDIR(new_inode->i_mode))
463 dentry_unhash(new_dentry);
464
465 err = fat_scan(old_dir, old_name, &old_sinfo); 460 err = fat_scan(old_dir, old_name, &old_sinfo);
466 if (err) { 461 if (err) {
467 err = -EIO; 462 err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c61a6789f36c..20b4ea53fdc4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
824 struct fat_slot_info sinfo; 824 struct fat_slot_info sinfo;
825 int err; 825 int err;
826 826
827 dentry_unhash(dentry);
828
829 lock_super(sb); 827 lock_super(sb);
830 828
831 err = fat_dir_empty(inode); 829 err = fat_dir_empty(inode);
@@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
933 int err, is_dir, update_dotdot, corrupt = 0; 931 int err, is_dir, update_dotdot, corrupt = 0;
934 struct super_block *sb = old_dir->i_sb; 932 struct super_block *sb = old_dir->i_sb;
935 933
936 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
937 dentry_unhash(new_dentry);
938
939 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 934 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
940 old_inode = old_dentry->d_inode; 935 old_inode = old_dentry->d_inode;
941 new_inode = new_dentry->d_inode; 936 new_inode = new_dentry->d_inode;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34591ee804b5..0f015a0468de 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1007,9 +1007,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1007 * In short, make sure you hash any inodes _before_ you start marking 1007 * In short, make sure you hash any inodes _before_ you start marking
1008 * them dirty. 1008 * them dirty.
1009 * 1009 *
1010 * This function *must* be atomic for the I_DIRTY_PAGES case -
1011 * set_page_dirty() is called under spinlock in several places.
1012 *
1013 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 1010 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
1014 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 1011 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
1015 * the kernel-internal blockdev inode represents the dirtying time of the 1012 * the kernel-internal blockdev inode represents the dirtying time of the
@@ -1028,7 +1025,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1028 */ 1025 */
1029 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1026 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1030 if (sb->s_op->dirty_inode) 1027 if (sb->s_op->dirty_inode)
1031 sb->s_op->dirty_inode(inode); 1028 sb->s_op->dirty_inode(inode, flags);
1032 } 1029 }
1033 1030
1034 /* 1031 /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0d0e3faddcfa..d50160714595 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
667 if (IS_ERR(req)) 667 if (IS_ERR(req))
668 return PTR_ERR(req); 668 return PTR_ERR(req);
669 669
670 dentry_unhash(entry);
671
672 req->in.h.opcode = FUSE_RMDIR; 670 req->in.h.opcode = FUSE_RMDIR;
673 req->in.h.nodeid = get_node_id(dir); 671 req->in.h.nodeid = get_node_id(dir);
674 req->in.numargs = 1; 672 req->in.numargs = 1;
@@ -694,9 +692,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
694 struct fuse_conn *fc = get_fuse_conn(olddir); 692 struct fuse_conn *fc = get_fuse_conn(olddir);
695 struct fuse_req *req = fuse_get_req(fc); 693 struct fuse_req *req = fuse_get_req(fc);
696 694
697 if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
698 dentry_unhash(newent);
699
700 if (IS_ERR(req)) 695 if (IS_ERR(req))
701 return PTR_ERR(req); 696 return PTR_ERR(req);
702 697
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 1cb70cdba2c1..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
253 struct inode *inode = dentry->d_inode; 253 struct inode *inode = dentry->d_inode;
254 int res; 254 int res;
255 255
256 if (S_ISDIR(inode->i_mode))
257 dentry_unhash(dentry);
258
259 if (S_ISDIR(inode->i_mode) && inode->i_size != 2) 256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
260 return -ENOTEMPTY; 257 return -ENOTEMPTY;
261 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
286 283
287 /* Unlink destination if it already exists */ 284 /* Unlink destination if it already exists */
288 if (new_dentry->d_inode) { 285 if (new_dentry->d_inode) {
289 if (S_ISDIR(new_dentry->d_inode->i_mode))
290 dentry_unhash(new_dentry);
291
292 res = hfs_remove(new_dir, new_dentry); 286 res = hfs_remove(new_dir, new_dentry);
293 if (res) 287 if (res)
294 return res; 288 return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b28835091dd0..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
370 struct inode *inode = dentry->d_inode; 370 struct inode *inode = dentry->d_inode;
371 int res; 371 int res;
372 372
373 dentry_unhash(dentry);
374
375 if (inode->i_size != 2) 373 if (inode->i_size != 2)
376 return -ENOTEMPTY; 374 return -ENOTEMPTY;
377 375
@@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
469 467
470 /* Unlink destination if it already exists */ 468 /* Unlink destination if it already exists */
471 if (new_dentry->d_inode) { 469 if (new_dentry->d_inode) {
472 if (S_ISDIR(new_dentry->d_inode->i_mode)) { 470 if (S_ISDIR(new_dentry->d_inode->i_mode))
473 dentry_unhash(new_dentry);
474 res = hfsplus_rmdir(new_dir, new_dentry); 471 res = hfsplus_rmdir(new_dir, new_dentry);
475 } else { 472 else
476 res = hfsplus_unlink(new_dir, new_dentry); 473 res = hfsplus_unlink(new_dir, new_dentry);
477 }
478 if (res) 474 if (res)
479 return res; 475 return res;
480 } 476 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6816b9e6903..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
683 char *file; 683 char *file;
684 int err; 684 int err;
685 685
686 dentry_unhash(dentry);
687
688 if ((file = dentry_name(dentry)) == NULL) 686 if ((file = dentry_name(dentry)) == NULL)
689 return -ENOMEM; 687 return -ENOMEM;
690 err = do_rmdir(file); 688 err = do_rmdir(file);
@@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
738 char *from_name, *to_name; 736 char *from_name, *to_name;
739 int err; 737 int err;
740 738
741 if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
742 dentry_unhash(to);
743
744 if ((from_name = dentry_name(from)) == NULL) 739 if ((from_name = dentry_name(from)) == NULL)
745 return -ENOMEM; 740 return -ENOMEM;
746 if ((to_name = dentry_name(to)) == NULL) { 741 if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ff0ce21c0867..acf95dab2aac 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -439,8 +439,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
439 int err; 439 int err;
440 int r; 440 int r;
441 441
442 dentry_unhash(dentry);
443
444 hpfs_adjust_length(name, &len); 442 hpfs_adjust_length(name, &len);
445 hpfs_lock(dir->i_sb); 443 hpfs_lock(dir->i_sb);
446 err = -ENOENT; 444 err = -ENOENT;
@@ -535,9 +533,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
535 struct fnode *fnode; 533 struct fnode *fnode;
536 int err; 534 int err;
537 535
538 if (new_inode && S_ISDIR(new_inode->i_mode))
539 dentry_unhash(new_dentry);
540
541 if ((err = hpfs_chk_name(new_name, &new_len))) return err; 536 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
542 err = 0; 537 err = 0;
543 hpfs_adjust_length(old_name, &old_len); 538 hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/inode.c b/fs/inode.c
index 990d284877a1..0f7e88a7803f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1,9 +1,7 @@
1/* 1/*
2 * linux/fs/inode.c
3 *
4 * (C) 1997 Linus Torvalds 2 * (C) 1997 Linus Torvalds
3 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
5 */ 4 */
6
7#include <linux/fs.h> 5#include <linux/fs.h>
8#include <linux/mm.h> 6#include <linux/mm.h>
9#include <linux/dcache.h> 7#include <linux/dcache.h>
@@ -27,10 +25,11 @@
27#include <linux/prefetch.h> 25#include <linux/prefetch.h>
28#include <linux/ima.h> 26#include <linux/ima.h>
29#include <linux/cred.h> 27#include <linux/cred.h>
28#include <linux/buffer_head.h> /* for inode_has_buffers */
30#include "internal.h" 29#include "internal.h"
31 30
32/* 31/*
33 * inode locking rules. 32 * Inode locking rules:
34 * 33 *
35 * inode->i_lock protects: 34 * inode->i_lock protects:
36 * inode->i_state, inode->i_hash, __iget() 35 * inode->i_state, inode->i_hash, __iget()
@@ -60,54 +59,11 @@
60 * inode_hash_lock 59 * inode_hash_lock
61 */ 60 */
62 61
63/*
64 * This is needed for the following functions:
65 * - inode_has_buffers
66 * - invalidate_bdev
67 *
68 * FIXME: remove all knowledge of the buffer layer from this file
69 */
70#include <linux/buffer_head.h>
71
72/*
73 * New inode.c implementation.
74 *
75 * This implementation has the basic premise of trying
76 * to be extremely low-overhead and SMP-safe, yet be
77 * simple enough to be "obviously correct".
78 *
79 * Famous last words.
80 */
81
82/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
83
84/* #define INODE_PARANOIA 1 */
85/* #define INODE_DEBUG 1 */
86
87/*
88 * Inode lookup is no longer as critical as it used to be:
89 * most of the lookups are going to be through the dcache.
90 */
91#define I_HASHBITS i_hash_shift
92#define I_HASHMASK i_hash_mask
93
94static unsigned int i_hash_mask __read_mostly; 62static unsigned int i_hash_mask __read_mostly;
95static unsigned int i_hash_shift __read_mostly; 63static unsigned int i_hash_shift __read_mostly;
96static struct hlist_head *inode_hashtable __read_mostly; 64static struct hlist_head *inode_hashtable __read_mostly;
97static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
98 66
99/*
100 * Each inode can be on two separate lists. One is
101 * the hash list of the inode, used for lookups. The
102 * other linked list is the "type" list:
103 * "in_use" - valid inode, i_count > 0, i_nlink > 0
104 * "dirty" - as "in_use" but also dirty
105 * "unused" - valid inode, i_count = 0
106 *
107 * A "dirty" list is maintained for each super block,
108 * allowing for low-overhead inode sync() operations.
109 */
110
111static LIST_HEAD(inode_lru); 67static LIST_HEAD(inode_lru);
112static DEFINE_SPINLOCK(inode_lru_lock); 68static DEFINE_SPINLOCK(inode_lru_lock);
113 69
@@ -424,8 +380,8 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
424 380
425 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / 381 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
426 L1_CACHE_BYTES; 382 L1_CACHE_BYTES;
427 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); 383 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
428 return tmp & I_HASHMASK; 384 return tmp & i_hash_mask;
429} 385}
430 386
431/** 387/**
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 9a1e86fc1362..4bca6a2e5c07 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -605,8 +605,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
605 int ret; 605 int ret;
606 uint32_t now = get_seconds(); 606 uint32_t now = get_seconds();
607 607
608 dentry_unhash(dentry);
609
610 for (fd = f->dents ; fd; fd = fd->next) { 608 for (fd = f->dents ; fd; fd = fd->next) {
611 if (fd->ino) 609 if (fd->ino)
612 return -ENOTEMPTY; 610 return -ENOTEMPTY;
@@ -782,9 +780,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
782 uint8_t type; 780 uint8_t type;
783 uint32_t now; 781 uint32_t now;
784 782
785 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
786 dentry_unhash(new_dentry);
787
788 /* The VFS will check for us and prevent trying to rename a 783 /* The VFS will check for us and prevent trying to rename a
789 * file over a directory and vice versa, but if it's a directory, 784 * file over a directory and vice versa, but if it's a directory,
790 * the VFS can't check whether the victim is empty. The filesystem 785 * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e896e67767eb..46ad619b6124 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -357,7 +357,7 @@ error:
357 return ERR_PTR(ret); 357 return ERR_PTR(ret);
358} 358}
359 359
360void jffs2_dirty_inode(struct inode *inode) 360void jffs2_dirty_inode(struct inode *inode, int flags)
361{ 361{
362 struct iattr iattr; 362 struct iattr iattr;
363 363
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 00bae7cc2e48..65c6c43ca482 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -172,7 +172,7 @@ int jffs2_setattr (struct dentry *, struct iattr *);
172int jffs2_do_setattr (struct inode *, struct iattr *); 172int jffs2_do_setattr (struct inode *, struct iattr *);
173struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
174void jffs2_evict_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
175void jffs2_dirty_inode(struct inode *inode); 175void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, int mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index eddbb373209e..109655904bbc 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -173,7 +173,7 @@ void jfs_evict_inode(struct inode *inode)
173 dquot_drop(inode); 173 dquot_drop(inode);
174} 174}
175 175
176void jfs_dirty_inode(struct inode *inode) 176void jfs_dirty_inode(struct inode *inode, int flags)
177{ 177{
178 static int noisy = 5; 178 static int noisy = 5;
179 179
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 155e91eff07d..ec2fb8b945fc 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -28,7 +28,7 @@ extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode *, struct writeback_control *); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_evict_inode(struct inode *); 30extern void jfs_evict_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *, int);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
33extern void jfs_truncate_nolock(struct inode *, loff_t); 33extern void jfs_truncate_nolock(struct inode *, loff_t);
34extern void jfs_free_zero_link(struct inode *); 34extern void jfs_free_zero_link(struct inode *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 865df16a6cf3..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
360 360
361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
362 362
363 dentry_unhash(dentry);
364
365 /* Init inode for quota operations. */ 363 /* Init inode for quota operations. */
366 dquot_initialize(dip); 364 dquot_initialize(dip);
367 dquot_initialize(ip); 365 dquot_initialize(ip);
@@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1095 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1098 new_dentry->d_name.name); 1096 new_dentry->d_name.name);
1099 1097
1100 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1101 dentry_unhash(new_dentry);
1102
1103 dquot_initialize(old_dir); 1098 dquot_initialize(old_dir);
1104 dquot_initialize(new_dir); 1099 dquot_initialize(new_dir);
1105 1100
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f34c9cde9e94..9ed89d1663f8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{ 273{
274 struct inode *inode = dentry->d_inode; 274 struct inode *inode = dentry->d_inode;
275 275
276 dentry_unhash(dentry);
277
278 if (!logfs_empty_dir(inode)) 276 if (!logfs_empty_dir(inode))
279 return -ENOTEMPTY; 277 return -ENOTEMPTY;
280 278
@@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
624 loff_t pos; 622 loff_t pos;
625 int err; 623 int err;
626 624
627 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
628 dentry_unhash(new_dentry);
629
630 /* 1. locate source dd */ 625 /* 1. locate source dd */
631 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); 626 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
632 if (err) 627 if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f60aed8db9c4..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
168 struct inode * inode = dentry->d_inode; 168 struct inode * inode = dentry->d_inode;
169 int err = -ENOTEMPTY; 169 int err = -ENOTEMPTY;
170 170
171 dentry_unhash(dentry);
172
173 if (minix_empty_dir(inode)) { 171 if (minix_empty_dir(inode)) {
174 err = minix_unlink(dir, dentry); 172 err = minix_unlink(dir, dentry);
175 if (!err) { 173 if (!err) {
@@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
192 struct minix_dir_entry * old_de; 190 struct minix_dir_entry * old_de;
193 int err = -ENOENT; 191 int err = -ENOENT;
194 192
195 if (new_inode && S_ISDIR(new_inode->i_mode))
196 dentry_unhash(new_dentry);
197
198 old_de = minix_find_entry(old_dentry, &old_page); 193 old_de = minix_find_entry(old_dentry, &old_page);
199 if (!old_de) 194 if (!old_de)
200 goto out; 195 goto out;
diff --git a/fs/namei.c b/fs/namei.c
index 2358b326b221..e2e4e8d032ee 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -919,12 +919,11 @@ static inline bool managed_dentry_might_block(struct dentry *dentry)
919} 919}
920 920
921/* 921/*
922 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 922 * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
923 * meet a managed dentry and we're not walking to "..". True is returned to 923 * we meet a managed dentry that would need blocking.
924 * continue, false to abort.
925 */ 924 */
926static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 925static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
927 struct inode **inode, bool reverse_transit) 926 struct inode **inode)
928{ 927{
929 for (;;) { 928 for (;;) {
930 struct vfsmount *mounted; 929 struct vfsmount *mounted;
@@ -933,8 +932,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
933 * that wants to block transit. 932 * that wants to block transit.
934 */ 933 */
935 *inode = path->dentry->d_inode; 934 *inode = path->dentry->d_inode;
936 if (!reverse_transit && 935 if (unlikely(managed_dentry_might_block(path->dentry)))
937 unlikely(managed_dentry_might_block(path->dentry)))
938 return false; 936 return false;
939 937
940 if (!d_mountpoint(path->dentry)) 938 if (!d_mountpoint(path->dentry))
@@ -947,16 +945,24 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
947 path->dentry = mounted->mnt_root; 945 path->dentry = mounted->mnt_root;
948 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 946 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
949 } 947 }
950
951 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
952 return reverse_transit;
953 return true; 948 return true;
954} 949}
955 950
956static int follow_dotdot_rcu(struct nameidata *nd) 951static void follow_mount_rcu(struct nameidata *nd)
957{ 952{
958 struct inode *inode = nd->inode; 953 while (d_mountpoint(nd->path.dentry)) {
954 struct vfsmount *mounted;
955 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
956 if (!mounted)
957 break;
958 nd->path.mnt = mounted;
959 nd->path.dentry = mounted->mnt_root;
960 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
961 }
962}
959 963
964static int follow_dotdot_rcu(struct nameidata *nd)
965{
960 set_root_rcu(nd); 966 set_root_rcu(nd);
961 967
962 while (1) { 968 while (1) {
@@ -972,7 +978,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
972 seq = read_seqcount_begin(&parent->d_seq); 978 seq = read_seqcount_begin(&parent->d_seq);
973 if (read_seqcount_retry(&old->d_seq, nd->seq)) 979 if (read_seqcount_retry(&old->d_seq, nd->seq))
974 goto failed; 980 goto failed;
975 inode = parent->d_inode;
976 nd->path.dentry = parent; 981 nd->path.dentry = parent;
977 nd->seq = seq; 982 nd->seq = seq;
978 break; 983 break;
@@ -980,10 +985,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
980 if (!follow_up_rcu(&nd->path)) 985 if (!follow_up_rcu(&nd->path))
981 break; 986 break;
982 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 987 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
983 inode = nd->path.dentry->d_inode;
984 } 988 }
985 __follow_mount_rcu(nd, &nd->path, &inode, true); 989 follow_mount_rcu(nd);
986 nd->inode = inode; 990 nd->inode = nd->path.dentry->d_inode;
987 return 0; 991 return 0;
988 992
989failed: 993failed:
@@ -1157,8 +1161,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1157 } 1161 }
1158 path->mnt = mnt; 1162 path->mnt = mnt;
1159 path->dentry = dentry; 1163 path->dentry = dentry;
1160 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1164 if (unlikely(!__follow_mount_rcu(nd, path, inode)))
1161 return 0; 1165 goto unlazy;
1166 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1167 goto unlazy;
1168 return 0;
1162unlazy: 1169unlazy:
1163 if (unlazy_walk(nd, dentry)) 1170 if (unlazy_walk(nd, dentry))
1164 return -ECHILD; 1171 return -ECHILD;
@@ -2572,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2572 if (error) 2579 if (error)
2573 goto out; 2580 goto out;
2574 2581
2582 shrink_dcache_parent(dentry);
2575 error = dir->i_op->rmdir(dir, dentry); 2583 error = dir->i_op->rmdir(dir, dentry);
2576 if (error) 2584 if (error)
2577 goto out; 2585 goto out;
@@ -2986,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2986 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) 2994 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
2987 goto out; 2995 goto out;
2988 2996
2997 if (target)
2998 shrink_dcache_parent(new_dentry);
2989 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2999 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2990 if (error) 3000 if (error)
2991 goto out; 3001 goto out;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e3e646b06404..9c51f621e901 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,8 +1033,11 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1033 DPRINTK("ncp_rmdir: removing %s/%s\n", 1033 DPRINTK("ncp_rmdir: removing %s/%s\n",
1034 dentry->d_parent->d_name.name, dentry->d_name.name); 1034 dentry->d_parent->d_name.name, dentry->d_name.name);
1035 1035
1036 /*
1037 * fail with EBUSY if there are still references to this
1038 * directory.
1039 */
1036 dentry_unhash(dentry); 1040 dentry_unhash(dentry);
1037
1038 error = -EBUSY; 1041 error = -EBUSY;
1039 if (!d_unhashed(dentry)) 1042 if (!d_unhashed(dentry))
1040 goto out; 1043 goto out;
@@ -1141,8 +1144,16 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1141 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1144 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1142 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1145 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1143 1146
1144 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) 1147 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
1148 /*
1149 * fail with EBUSY if there are still references to this
1150 * directory.
1151 */
1145 dentry_unhash(new_dentry); 1152 dentry_unhash(new_dentry);
1153 error = -EBUSY;
1154 if (!d_unhashed(new_dentry))
1155 goto out;
1156 }
1146 1157
1147 ncp_age_dentry(server, old_dentry); 1158 ncp_age_dentry(server, old_dentry);
1148 ncp_age_dentry(server, new_dentry); 1159 ncp_age_dentry(server, new_dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba306658a6db..81515545ba75 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
87config PNFS_FILE_LAYOUT 87config PNFS_FILE_LAYOUT
88 tristate 88 tristate
89 89
90config PNFS_OBJLAYOUT
91 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
92 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
93 help
94 Say M here if you want your pNFS client to support the Objects Layout Driver.
95 Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
96 upper level driver (SCSI_OSD_ULD).
97
98 If unsure, say N.
99
90config ROOT_NFS 100config ROOT_NFS
91 bool "Root file system on NFS" 101 bool "Root file system on NFS"
92 depends on NFS_FS=y && IP_PNP 102 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9e3814..6a34f7dd0e6f 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o 18nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
19nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21 21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o 22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
24
25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce7311b..b257383bb565 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
167 167
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170
171struct cb_devicenotifyitem {
172 uint32_t cbd_notify_type;
173 uint32_t cbd_layout_type;
174 struct nfs4_deviceid cbd_dev_id;
175 uint32_t cbd_immediate;
176};
177
178struct cb_devicenotifyargs {
179 int ndevs;
180 struct cb_devicenotifyitem *devs;
181};
182
183extern __be32 nfs4_callback_devicenotify(
184 struct cb_devicenotifyargs *args,
185 void *dummy, struct cb_process_state *cps);
186
170#endif /* CONFIG_NFS_V4_1 */ 187#endif /* CONFIG_NFS_V4_1 */
171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); 188extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, 189extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dccea18e..d4d1954e9bb9 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
139 spin_lock(&ino->i_lock); 139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list, 141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode)) 142 &args->cbl_range))
143 rv = NFS4ERR_DELAY; 143 rv = NFS4ERR_DELAY;
144 else 144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT; 145 rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
184 ino = lo->plh_inode; 184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock); 185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) 187 if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
241 do_callback_layoutrecall(clp, &args); 241 do_callback_layoutrecall(clp, &args);
242} 242}
243 243
244__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
245 void *dummy, struct cb_process_state *cps)
246{
247 int i;
248 __be32 res = 0;
249 struct nfs_client *clp = cps->clp;
250 struct nfs_server *server = NULL;
251
252 dprintk("%s: -->\n", __func__);
253
254 if (!clp) {
255 res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
256 goto out;
257 }
258
259 for (i = 0; i < args->ndevs; i++) {
260 struct cb_devicenotifyitem *dev = &args->devs[i];
261
262 if (!server ||
263 server->pnfs_curr_ld->id != dev->cbd_layout_type) {
264 rcu_read_lock();
265 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
266 if (server->pnfs_curr_ld &&
267 server->pnfs_curr_ld->id == dev->cbd_layout_type) {
268 rcu_read_unlock();
269 goto found;
270 }
271 rcu_read_unlock();
272 dprintk("%s: layout type %u not found\n",
273 __func__, dev->cbd_layout_type);
274 continue;
275 }
276
277 found:
278 if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
279 dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
280 "deleting instead\n", __func__);
281 nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
282 }
283
284out:
285 kfree(args->devs);
286 dprintk("%s: exit with status = %u\n",
287 __func__, be32_to_cpu(res));
288 return res;
289}
290
244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 291int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
245{ 292{
246 if (delegation == NULL) 293 if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62ce7c1..c6c86a77e043 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@
25 25
26#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 29#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
29 4 + 1 + 3) 30 4 + 1 + 3)
30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 31#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
284 return status; 285 return status;
285} 286}
286 287
288static
289__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
290 struct xdr_stream *xdr,
291 struct cb_devicenotifyargs *args)
292{
293 __be32 *p;
294 __be32 status = 0;
295 u32 tmp;
296 int n, i;
297 args->ndevs = 0;
298
299 /* Num of device notifications */
300 p = read_buf(xdr, sizeof(uint32_t));
301 if (unlikely(p == NULL)) {
302 status = htonl(NFS4ERR_BADXDR);
303 goto out;
304 }
305 n = ntohl(*p++);
306 if (n <= 0)
307 goto out;
308
309 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
310 if (!args->devs) {
311 status = htonl(NFS4ERR_DELAY);
312 goto out;
313 }
314
315 /* Decode each dev notification */
316 for (i = 0; i < n; i++) {
317 struct cb_devicenotifyitem *dev = &args->devs[i];
318
319 p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
320 if (unlikely(p == NULL)) {
321 status = htonl(NFS4ERR_BADXDR);
322 goto err;
323 }
324
325 tmp = ntohl(*p++); /* bitmap size */
326 if (tmp != 1) {
327 status = htonl(NFS4ERR_INVAL);
328 goto err;
329 }
330 dev->cbd_notify_type = ntohl(*p++);
331 if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
332 dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
333 status = htonl(NFS4ERR_INVAL);
334 goto err;
335 }
336
337 tmp = ntohl(*p++); /* opaque size */
338 if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
339 (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
340 ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
341 (tmp != NFS4_DEVICEID4_SIZE + 4))) {
342 status = htonl(NFS4ERR_INVAL);
343 goto err;
344 }
345 dev->cbd_layout_type = ntohl(*p++);
346 memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
347 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
348
349 if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
350 p = read_buf(xdr, sizeof(uint32_t));
351 if (unlikely(p == NULL)) {
352 status = htonl(NFS4ERR_BADXDR);
353 goto err;
354 }
355 dev->cbd_immediate = ntohl(*p++);
356 } else {
357 dev->cbd_immediate = 0;
358 }
359
360 args->ndevs++;
361
362 dprintk("%s: type %d layout 0x%x immediate %d\n",
363 __func__, dev->cbd_notify_type, dev->cbd_layout_type,
364 dev->cbd_immediate);
365 }
366out:
367 dprintk("%s: status %d ndevs %d\n",
368 __func__, ntohl(status), args->ndevs);
369 return status;
370err:
371 kfree(args->devs);
372 goto out;
373}
374
287static __be32 decode_sessionid(struct xdr_stream *xdr, 375static __be32 decode_sessionid(struct xdr_stream *xdr,
288 struct nfs4_sessionid *sid) 376 struct nfs4_sessionid *sid)
289{ 377{
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
639 case OP_CB_RECALL_ANY: 727 case OP_CB_RECALL_ANY:
640 case OP_CB_RECALL_SLOT: 728 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL: 729 case OP_CB_LAYOUTRECALL:
730 case OP_CB_NOTIFY_DEVICEID:
642 *op = &callback_ops[op_nr]; 731 *op = &callback_ops[op_nr];
643 break; 732 break;
644 733
645 case OP_CB_NOTIFY_DEVICEID:
646 case OP_CB_NOTIFY: 734 case OP_CB_NOTIFY:
647 case OP_CB_PUSH_DELEG: 735 case OP_CB_PUSH_DELEG:
648 case OP_CB_RECALLABLE_OBJ_AVAIL: 736 case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
849 (callback_decode_arg_t)decode_layoutrecall_args, 937 (callback_decode_arg_t)decode_layoutrecall_args,
850 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, 938 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
851 }, 939 },
940 [OP_CB_NOTIFY_DEVICEID] = {
941 .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
942 .decode_args =
943 (callback_decode_arg_t)decode_devicenotify_args,
944 .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
945 },
852 [OP_CB_SEQUENCE] = { 946 [OP_CB_SEQUENCE] = {
853 .process_op = (callback_process_op_t)nfs4_callback_sequence, 947 .process_op = (callback_process_op_t)nfs4_callback_sequence,
854 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 948 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 139be9647d80..b3dc2b88b65b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
290 if (clp->cl_machine_cred != NULL) 290 if (clp->cl_machine_cred != NULL)
291 put_rpccred(clp->cl_machine_cred); 291 put_rpccred(clp->cl_machine_cred);
292 292
293 nfs4_deviceid_purge_client(clp);
294
293 kfree(clp->cl_hostname); 295 kfree(clp->cl_hostname);
294 kfree(clp); 296 kfree(clp);
295 297
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index bbbc6bf5cb2e..dd25c2aec375 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -21,25 +21,13 @@
21#include "delegation.h" 21#include "delegation.h"
22#include "internal.h" 22#include "internal.h"
23 23
24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{
26 kfree(delegation);
27}
28
29static void nfs_free_delegation_callback(struct rcu_head *head)
30{
31 struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
32
33 nfs_do_free_delegation(delegation);
34}
35
36static void nfs_free_delegation(struct nfs_delegation *delegation) 24static void nfs_free_delegation(struct nfs_delegation *delegation)
37{ 25{
38 if (delegation->cred) { 26 if (delegation->cred) {
39 put_rpccred(delegation->cred); 27 put_rpccred(delegation->cred);
40 delegation->cred = NULL; 28 delegation->cred = NULL;
41 } 29 }
42 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 30 kfree_rcu(delegation, rcu);
43} 31}
44 32
45/** 33/**
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 424e47773a84..ededdbd0db38 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
512 struct page **xdr_pages, struct page *page, unsigned int buflen) 512 struct page **xdr_pages, struct page *page, unsigned int buflen)
513{ 513{
514 struct xdr_stream stream; 514 struct xdr_stream stream;
515 struct xdr_buf buf = { 515 struct xdr_buf buf;
516 .pages = xdr_pages,
517 .page_len = buflen,
518 .buflen = buflen,
519 .len = buflen,
520 };
521 struct page *scratch; 516 struct page *scratch;
522 struct nfs_cache_array *array; 517 struct nfs_cache_array *array;
523 unsigned int count = 0; 518 unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
527 if (scratch == NULL) 522 if (scratch == NULL)
528 return -ENOMEM; 523 return -ENOMEM;
529 524
530 xdr_init_decode(&stream, &buf, NULL); 525 xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
531 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 526 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
532 527
533 do { 528 do {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31ad7a5e..144f2a3c7185 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1298,8 +1298,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1298 i_size_write(inode, new_isize); 1298 i_size_write(inode, new_isize);
1299 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1299 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1300 } 1300 }
1301 dprintk("NFS: isize change on server for file %s/%ld\n", 1301 dprintk("NFS: isize change on server for file %s/%ld "
1302 inode->i_sb->s_id, inode->i_ino); 1302 "(%Ld to %Ld)\n",
1303 inode->i_sb->s_id,
1304 inode->i_ino,
1305 (long long)cur_isize,
1306 (long long)new_isize);
1303 } 1307 }
1304 } else 1308 } else
1305 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR 1309 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
@@ -1424,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1424 */ 1428 */
1425void nfs4_evict_inode(struct inode *inode) 1429void nfs4_evict_inode(struct inode *inode)
1426{ 1430{
1427 pnfs_destroy_layout(NFS_I(inode));
1428 truncate_inode_pages(&inode->i_data, 0); 1431 truncate_inode_pages(&inode->i_data, 0);
1429 end_writeback(inode); 1432 end_writeback(inode);
1433 pnfs_return_layout(inode);
1434 pnfs_destroy_layout(NFS_I(inode));
1430 /* If we are holding a delegation, return it! */ 1435 /* If we are holding a delegation, return it! */
1431 nfs_inode_return_delegation_noreclaim(inode); 1436 nfs_inode_return_delegation_noreclaim(inode);
1432 /* First call standard NFS clear_inode() code */ 1437 /* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2df6ca7b5898..b9056cbe68d6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
310#endif 310#endif
311 311
312/* nfs4proc.c */ 312/* nfs4proc.c */
313extern void __nfs4_read_done_cb(struct nfs_read_data *);
313extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); 314extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
314extern int nfs4_init_client(struct nfs_client *clp, 315extern int nfs4_init_client(struct nfs_client *clp,
315 const struct rpc_timeout *timeparms, 316 const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be79dc9f386d..426908809c97 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
421 struct nfs4_deviceid *id, 421 struct nfs4_deviceid *id,
422 gfp_t gfp_flags) 422 gfp_t gfp_flags)
423{ 423{
424 struct nfs4_deviceid_node *d;
424 struct nfs4_file_layout_dsaddr *dsaddr; 425 struct nfs4_file_layout_dsaddr *dsaddr;
425 int status = -EINVAL; 426 int status = -EINVAL;
426 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); 427 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
428 dprintk("--> %s\n", __func__); 429 dprintk("--> %s\n", __func__);
429 430
430 if (fl->pattern_offset > lgr->range.offset) { 431 if (fl->pattern_offset > lgr->range.offset) {
431 dprintk("%s pattern_offset %lld to large\n", 432 dprintk("%s pattern_offset %lld too large\n",
432 __func__, fl->pattern_offset); 433 __func__, fl->pattern_offset);
433 goto out; 434 goto out;
434 } 435 }
@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
440 } 441 }
441 442
442 /* find and reference the deviceid */ 443 /* find and reference the deviceid */
443 dsaddr = nfs4_fl_find_get_deviceid(id); 444 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
444 if (dsaddr == NULL) { 445 NFS_SERVER(lo->plh_inode)->nfs_client, id);
446 if (d == NULL) {
445 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); 447 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
446 if (dsaddr == NULL) 448 if (dsaddr == NULL)
447 goto out; 449 goto out;
448 } 450 } else
451 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
449 fl->dsaddr = dsaddr; 452 fl->dsaddr = dsaddr;
450 453
451 if (fl->first_stripe_index < 0 || 454 if (fl->first_stripe_index < 0 ||
@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
507 gfp_t gfp_flags) 510 gfp_t gfp_flags)
508{ 511{
509 struct xdr_stream stream; 512 struct xdr_stream stream;
510 struct xdr_buf buf = { 513 struct xdr_buf buf;
511 .pages = lgr->layoutp->pages,
512 .page_len = lgr->layoutp->len,
513 .buflen = lgr->layoutp->len,
514 .len = lgr->layoutp->len,
515 };
516 struct page *scratch; 514 struct page *scratch;
517 __be32 *p; 515 __be32 *p;
518 uint32_t nfl_util; 516 uint32_t nfl_util;
@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
524 if (!scratch) 522 if (!scratch)
525 return -ENOMEM; 523 return -ENOMEM;
526 524
527 xdr_init_decode(&stream, &buf, NULL); 525 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
528 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 526 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
529 527
530 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), 528 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
535 533
536 memcpy(id, p, sizeof(*id)); 534 memcpy(id, p, sizeof(*id));
537 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 535 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
538 print_deviceid(id); 536 nfs4_print_deviceid(id);
539 537
540 nfl_util = be32_to_cpup(p++); 538 nfl_util = be32_to_cpup(p++);
541 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) 539 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
653/* 651/*
654 * filelayout_pg_test(). Called by nfs_can_coalesce_requests() 652 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
655 * 653 *
656 * return 1 : coalesce page 654 * return true : coalesce page
657 * return 0 : don't coalesce page 655 * return false : don't coalesce page
658 */ 656 */
659int 657bool
660filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 658filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
661 struct nfs_page *req) 659 struct nfs_page *req)
662{ 660{
663 u64 p_stripe, r_stripe; 661 u64 p_stripe, r_stripe;
664 u32 stripe_unit; 662 u32 stripe_unit;
665 663
664 if (!pnfs_generic_pg_test(pgio, prev, req))
665 return 0;
666
666 if (!pgio->pg_lseg) 667 if (!pgio->pg_lseg)
667 return 1; 668 return 1;
668 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; 669 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
860 return -ENOMEM; 861 return -ENOMEM;
861} 862}
862 863
864static void
865filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
866{
867 nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
868}
869
863static struct pnfs_layoutdriver_type filelayout_type = { 870static struct pnfs_layoutdriver_type filelayout_type = {
864 .id = LAYOUT_NFSV4_1_FILES, 871 .id = LAYOUT_NFSV4_1_FILES,
865 .name = "LAYOUT_NFSV4_1_FILES", 872 .name = "LAYOUT_NFSV4_1_FILES",
@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
872 .commit_pagelist = filelayout_commit_pagelist, 879 .commit_pagelist = filelayout_commit_pagelist,
873 .read_pagelist = filelayout_read_pagelist, 880 .read_pagelist = filelayout_read_pagelist,
874 .write_pagelist = filelayout_write_pagelist, 881 .write_pagelist = filelayout_write_pagelist,
882 .free_deviceid_node = filelayout_free_deveiceid_node,
875}; 883};
876 884
877static int __init nfs4filelayout_init(void) 885static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2b461d77b43a..cebe01e3795e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60 60
61struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
62 struct hlist_node node; 62 struct nfs4_deviceid_node id_node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags; 63 unsigned long flags;
66 u32 stripe_count; 64 u32 stripe_count;
67 u8 *stripe_indices; 65 u8 *stripe_indices;
@@ -95,14 +93,12 @@ extern struct nfs_fh *
95nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
96 94
97extern void print_ds(struct nfs4_pnfs_ds *ds); 95extern void print_ds(struct nfs4_pnfs_ds *ds);
98extern void print_deviceid(struct nfs4_deviceid *dev_id);
99u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 96u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
100u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 97u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
101struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 98struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
102 u32 ds_idx); 99 u32 ds_idx);
103extern struct nfs4_file_layout_dsaddr *
104nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
105extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 100extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
101extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
106struct nfs4_file_layout_dsaddr * 102struct nfs4_file_layout_dsaddr *
107get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 103get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
108 104
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index db07c7af1395..3b7bf1377264 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,30 +37,6 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
64 * Data server cache 40 * Data server cache
65 * 41 *
66 * Data servers can be mapped to different device ids. 42 * Data servers can be mapped to different device ids.
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
89 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); 65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
90} 66}
91 67
92void
93print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
94{
95 int i;
96
97 ifdebug(FACILITY) {
98 printk("%s dsaddr->ds_num %d\n", __func__,
99 dsaddr->ds_num);
100 for (i = 0; i < dsaddr->ds_num; i++)
101 print_ds(dsaddr->ds_list[i]);
102 }
103}
104
105void print_deviceid(struct nfs4_deviceid *id)
106{
107 u32 *p = (u32 *)id;
108
109 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
110 p[0], p[1], p[2], p[3]);
111}
112
113/* nfs4_ds_cache_lock is held */ 68/* nfs4_ds_cache_lock is held */
114static struct nfs4_pnfs_ds * 69static struct nfs4_pnfs_ds *
115_data_server_lookup_locked(u32 ip_addr, u32 port) 70_data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
201 kfree(ds); 156 kfree(ds);
202} 157}
203 158
204static void 159void
205nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 160nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
206{ 161{
207 struct nfs4_pnfs_ds *ds; 162 struct nfs4_pnfs_ds *ds;
208 int i; 163 int i;
209 164
210 print_deviceid(&dsaddr->deviceid); 165 nfs4_print_deviceid(&dsaddr->id_node.deviceid);
211 166
212 for (i = 0; i < dsaddr->ds_num; i++) { 167 for (i = 0; i < dsaddr->ds_num; i++) {
213 ds = dsaddr->ds_list[i]; 168 ds = dsaddr->ds_list[i];
@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
353 u8 max_stripe_index; 308 u8 max_stripe_index;
354 struct nfs4_file_layout_dsaddr *dsaddr = NULL; 309 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
355 struct xdr_stream stream; 310 struct xdr_stream stream;
356 struct xdr_buf buf = { 311 struct xdr_buf buf;
357 .pages = pdev->pages,
358 .page_len = pdev->pglen,
359 .buflen = pdev->pglen,
360 .len = pdev->pglen,
361 };
362 struct page *scratch; 312 struct page *scratch;
363 313
364 /* set up xdr stream */ 314 /* set up xdr stream */
@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
366 if (!scratch) 316 if (!scratch)
367 goto out_err; 317 goto out_err;
368 318
369 xdr_init_decode(&stream, &buf, NULL); 319 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
370 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 320 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
371 321
372 /* Get the stripe count (number of stripe index) */ 322 /* Get the stripe count (number of stripe index) */
@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
431 dsaddr->stripe_indices = stripe_indices; 381 dsaddr->stripe_indices = stripe_indices;
432 stripe_indices = NULL; 382 stripe_indices = NULL;
433 dsaddr->ds_num = num; 383 dsaddr->ds_num = num;
434 384 nfs4_init_deviceid_node(&dsaddr->id_node,
435 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); 385 NFS_SERVER(ino)->pnfs_curr_ld,
386 NFS_SERVER(ino)->nfs_client,
387 &pdev->dev_id);
436 388
437 for (i = 0; i < dsaddr->ds_num; i++) { 389 for (i = 0; i < dsaddr->ds_num; i++) {
438 int j; 390 int j;
@@ -505,8 +457,8 @@ out_err:
505static struct nfs4_file_layout_dsaddr * 457static struct nfs4_file_layout_dsaddr *
506decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) 458decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
507{ 459{
508 struct nfs4_file_layout_dsaddr *d, *new; 460 struct nfs4_deviceid_node *d;
509 long hash; 461 struct nfs4_file_layout_dsaddr *n, *new;
510 462
511 new = decode_device(inode, dev, gfp_flags); 463 new = decode_device(inode, dev, gfp_flags);
512 if (!new) { 464 if (!new) {
@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
515 return NULL; 467 return NULL;
516 } 468 }
517 469
518 spin_lock(&filelayout_deviceid_lock); 470 d = nfs4_insert_deviceid_node(&new->id_node);
519 d = nfs4_fl_find_get_deviceid(&new->deviceid); 471 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
520 if (d) { 472 if (n != new) {
521 spin_unlock(&filelayout_deviceid_lock);
522 nfs4_fl_free_deviceid(new); 473 nfs4_fl_free_deviceid(new);
523 return d; 474 return n;
524 } 475 }
525 476
526 INIT_HLIST_NODE(&new->node);
527 atomic_set(&new->ref, 1);
528 hash = nfs4_fl_deviceid_hash(&new->deviceid);
529 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
530 spin_unlock(&filelayout_deviceid_lock);
531
532 return new; 477 return new;
533} 478}
534 479
@@ -600,35 +545,7 @@ out_free:
600void 545void
601nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 546nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
602{ 547{
603 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { 548 nfs4_put_deviceid_node(&dsaddr->id_node);
604 hlist_del_rcu(&dsaddr->node);
605 spin_unlock(&filelayout_deviceid_lock);
606
607 synchronize_rcu();
608 nfs4_fl_free_deviceid(dsaddr);
609 }
610}
611
612struct nfs4_file_layout_dsaddr *
613nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
614{
615 struct nfs4_file_layout_dsaddr *d;
616 struct hlist_node *n;
617 long hash = nfs4_fl_deviceid_hash(id);
618
619
620 rcu_read_lock();
621 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
622 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
623 if (!atomic_inc_not_zero(&d->ref))
624 goto fail;
625 rcu_read_unlock();
626 return d;
627 }
628 }
629fail:
630 rcu_read_unlock();
631 return NULL;
632} 549}
633 550
634/* 551/*
@@ -676,15 +593,15 @@ static void
676filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, 593filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
677 int err, u32 ds_addr) 594 int err, u32 ds_addr)
678{ 595{
679 u32 *p = (u32 *)&dsaddr->deviceid; 596 u32 *p = (u32 *)&dsaddr->id_node.deviceid;
680 597
681 printk(KERN_ERR "NFS: data server %x connection error %d." 598 printk(KERN_ERR "NFS: data server %x connection error %d."
682 " Deviceid [%x%x%x%x] marked out of use.\n", 599 " Deviceid [%x%x%x%x] marked out of use.\n",
683 ds_addr, err, p[0], p[1], p[2], p[3]); 600 ds_addr, err, p[0], p[1], p[2], p[3]);
684 601
685 spin_lock(&filelayout_deviceid_lock); 602 spin_lock(&nfs4_ds_cache_lock);
686 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; 603 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
687 spin_unlock(&filelayout_deviceid_lock); 604 spin_unlock(&nfs4_ds_cache_lock);
688} 605}
689 606
690struct nfs4_pnfs_ds * 607struct nfs4_pnfs_ds *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf1b339c3937..d2c4b59c896d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -267,9 +267,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
267 break; 267 break;
268 nfs4_schedule_stateid_recovery(server, state); 268 nfs4_schedule_stateid_recovery(server, state);
269 goto wait_on_recovery; 269 goto wait_on_recovery;
270 case -NFS4ERR_EXPIRED:
271 if (state != NULL)
272 nfs4_schedule_stateid_recovery(server, state);
270 case -NFS4ERR_STALE_STATEID: 273 case -NFS4ERR_STALE_STATEID:
271 case -NFS4ERR_STALE_CLIENTID: 274 case -NFS4ERR_STALE_CLIENTID:
272 case -NFS4ERR_EXPIRED:
273 nfs4_schedule_lease_recovery(clp); 275 nfs4_schedule_lease_recovery(clp);
274 goto wait_on_recovery; 276 goto wait_on_recovery;
275#if defined(CONFIG_NFS_V4_1) 277#if defined(CONFIG_NFS_V4_1)
@@ -2361,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2361 struct nfs4_state *state = NULL; 2363 struct nfs4_state *state = NULL;
2362 int status; 2364 int status;
2363 2365
2366 if (pnfs_ld_layoutret_on_setattr(inode))
2367 pnfs_return_layout(inode);
2368
2364 nfs_fattr_init(fattr); 2369 nfs_fattr_init(fattr);
2365 2370
2366 /* Search for an existing open(O_WRITE) file */ 2371 /* Search for an existing open(O_WRITE) file */
@@ -3175,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3175 return err; 3180 return err;
3176} 3181}
3177 3182
3183void __nfs4_read_done_cb(struct nfs_read_data *data)
3184{
3185 nfs_invalidate_atime(data->inode);
3186}
3187
3178static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) 3188static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3179{ 3189{
3180 struct nfs_server *server = NFS_SERVER(data->inode); 3190 struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3184,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3184 return -EAGAIN; 3194 return -EAGAIN;
3185 } 3195 }
3186 3196
3187 nfs_invalidate_atime(data->inode); 3197 __nfs4_read_done_cb(data);
3188 if (task->tk_status > 0) 3198 if (task->tk_status > 0)
3189 renew_lease(server, data->timestamp); 3199 renew_lease(server, data->timestamp);
3190 return 0; 3200 return 0;
@@ -3198,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3198 if (!nfs4_sequence_done(task, &data->res.seq_res)) 3208 if (!nfs4_sequence_done(task, &data->res.seq_res))
3199 return -EAGAIN; 3209 return -EAGAIN;
3200 3210
3201 return data->read_done_cb(task, data); 3211 return data->read_done_cb ? data->read_done_cb(task, data) :
3212 nfs4_read_done_cb(task, data);
3202} 3213}
3203 3214
3204static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3215static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3243,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3243{ 3254{
3244 if (!nfs4_sequence_done(task, &data->res.seq_res)) 3255 if (!nfs4_sequence_done(task, &data->res.seq_res))
3245 return -EAGAIN; 3256 return -EAGAIN;
3246 return data->write_done_cb(task, data); 3257 return data->write_done_cb ? data->write_done_cb(task, data) :
3258 nfs4_write_done_cb(task, data);
3247} 3259}
3248 3260
3249/* Reset the the nfs_write_data to send the write to the MDS. */ 3261/* Reset the the nfs_write_data to send the write to the MDS. */
@@ -3670,9 +3682,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3670 break; 3682 break;
3671 nfs4_schedule_stateid_recovery(server, state); 3683 nfs4_schedule_stateid_recovery(server, state);
3672 goto wait_on_recovery; 3684 goto wait_on_recovery;
3685 case -NFS4ERR_EXPIRED:
3686 if (state != NULL)
3687 nfs4_schedule_stateid_recovery(server, state);
3673 case -NFS4ERR_STALE_STATEID: 3688 case -NFS4ERR_STALE_STATEID:
3674 case -NFS4ERR_STALE_CLIENTID: 3689 case -NFS4ERR_STALE_CLIENTID:
3675 case -NFS4ERR_EXPIRED:
3676 nfs4_schedule_lease_recovery(clp); 3690 nfs4_schedule_lease_recovery(clp);
3677 goto wait_on_recovery; 3691 goto wait_on_recovery;
3678#if defined(CONFIG_NFS_V4_1) 3692#if defined(CONFIG_NFS_V4_1)
@@ -4543,6 +4557,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4543 case -ESTALE: 4557 case -ESTALE:
4544 goto out; 4558 goto out;
4545 case -NFS4ERR_EXPIRED: 4559 case -NFS4ERR_EXPIRED:
4560 nfs4_schedule_stateid_recovery(server, state);
4546 case -NFS4ERR_STALE_CLIENTID: 4561 case -NFS4ERR_STALE_CLIENTID:
4547 case -NFS4ERR_STALE_STATEID: 4562 case -NFS4ERR_STALE_STATEID:
4548 nfs4_schedule_lease_recovery(server->nfs_client); 4563 nfs4_schedule_lease_recovery(server->nfs_client);
@@ -5666,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5666 return status; 5681 return status;
5667} 5682}
5668 5683
5684static void
5685nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
5686{
5687 struct nfs4_layoutreturn *lrp = calldata;
5688
5689 dprintk("--> %s\n", __func__);
5690 if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
5691 &lrp->res.seq_res, 0, task))
5692 return;
5693 rpc_call_start(task);
5694}
5695
5696static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
5697{
5698 struct nfs4_layoutreturn *lrp = calldata;
5699 struct nfs_server *server;
5700
5701 dprintk("--> %s\n", __func__);
5702
5703 if (!nfs4_sequence_done(task, &lrp->res.seq_res))
5704 return;
5705
5706 server = NFS_SERVER(lrp->args.inode);
5707 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5708 nfs_restart_rpc(task, lrp->clp);
5709 return;
5710 }
5711 if (task->tk_status == 0) {
5712 struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
5713
5714 if (lrp->res.lrs_present) {
5715 spin_lock(&lo->plh_inode->i_lock);
5716 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
5717 spin_unlock(&lo->plh_inode->i_lock);
5718 } else
5719 BUG_ON(!list_empty(&lo->plh_segs));
5720 }
5721 dprintk("<-- %s\n", __func__);
5722}
5723
5724static void nfs4_layoutreturn_release(void *calldata)
5725{
5726 struct nfs4_layoutreturn *lrp = calldata;
5727
5728 dprintk("--> %s\n", __func__);
5729 put_layout_hdr(NFS_I(lrp->args.inode)->layout);
5730 kfree(calldata);
5731 dprintk("<-- %s\n", __func__);
5732}
5733
5734static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
5735 .rpc_call_prepare = nfs4_layoutreturn_prepare,
5736 .rpc_call_done = nfs4_layoutreturn_done,
5737 .rpc_release = nfs4_layoutreturn_release,
5738};
5739
5740int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
5741{
5742 struct rpc_task *task;
5743 struct rpc_message msg = {
5744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
5745 .rpc_argp = &lrp->args,
5746 .rpc_resp = &lrp->res,
5747 };
5748 struct rpc_task_setup task_setup_data = {
5749 .rpc_client = lrp->clp->cl_rpcclient,
5750 .rpc_message = &msg,
5751 .callback_ops = &nfs4_layoutreturn_call_ops,
5752 .callback_data = lrp,
5753 };
5754 int status;
5755
5756 dprintk("--> %s\n", __func__);
5757 task = rpc_run_task(&task_setup_data);
5758 if (IS_ERR(task))
5759 return PTR_ERR(task);
5760 status = task->tk_status;
5761 dprintk("<-- %s status=%d\n", __func__, status);
5762 rpc_put_task(task);
5763 return status;
5764}
5765
5669static int 5766static int
5670_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5767_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5671{ 5768{
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 036f5adc9e1f..e97dd219f84f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1466,7 +1466,10 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1466#ifdef CONFIG_NFS_V4_1 1466#ifdef CONFIG_NFS_V4_1
1467void nfs4_schedule_session_recovery(struct nfs4_session *session) 1467void nfs4_schedule_session_recovery(struct nfs4_session *session)
1468{ 1468{
1469 nfs4_schedule_lease_recovery(session->clp); 1469 struct nfs_client *clp = session->clp;
1470
1471 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1472 nfs4_schedule_lease_recovery(clp);
1470} 1473}
1471EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); 1474EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1472 1475
@@ -1549,6 +1552,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
1549 status = nfs4_recovery_handle_error(clp, status); 1552 status = nfs4_recovery_handle_error(clp, status);
1550 goto out; 1553 goto out;
1551 } 1554 }
1555 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1552 /* create_session negotiated new slot table */ 1556 /* create_session negotiated new slot table */
1553 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1557 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1554 1558
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c3ccd2c46834..d869a5e5464b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
338 1 /* layoutupdate4 layout type */ + \ 338 1 /* layoutupdate4 layout type */ + \
339 1 /* NULL filelayout layoutupdate4 payload */) 339 1 /* NULL filelayout layoutupdate4 payload */)
340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
341 341#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
342 encode_stateid_maxsz + \
343 1 /* FIXME: opaque lrf_body always empty at the moment */)
344#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
345 1 + decode_stateid_maxsz)
342#else /* CONFIG_NFS_V4_1 */ 346#else /* CONFIG_NFS_V4_1 */
343#define encode_sequence_maxsz 0 347#define encode_sequence_maxsz 0
344#define decode_sequence_maxsz 0 348#define decode_sequence_maxsz 0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
760 decode_putfh_maxsz + \ 764 decode_putfh_maxsz + \
761 decode_layoutcommit_maxsz + \ 765 decode_layoutcommit_maxsz + \
762 decode_getattr_maxsz) 766 decode_getattr_maxsz)
763 767#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
768 encode_sequence_maxsz + \
769 encode_putfh_maxsz + \
770 encode_layoutreturn_maxsz)
771#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
772 decode_sequence_maxsz + \
773 decode_putfh_maxsz + \
774 decode_layoutreturn_maxsz)
764 775
765const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 776const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
766 compound_encode_hdr_maxsz + 777 compound_encode_hdr_maxsz +
@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
1864 1875
1865static int 1876static int
1866encode_layoutcommit(struct xdr_stream *xdr, 1877encode_layoutcommit(struct xdr_stream *xdr,
1878 struct inode *inode,
1867 const struct nfs4_layoutcommit_args *args, 1879 const struct nfs4_layoutcommit_args *args,
1868 struct compound_hdr *hdr) 1880 struct compound_hdr *hdr)
1869{ 1881{
@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
1872 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, 1884 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1873 NFS_SERVER(args->inode)->pnfs_curr_ld->id); 1885 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1874 1886
1875 p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); 1887 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
1876 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1888 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1877 /* Only whole file layouts */ 1889 /* Only whole file layouts */
1878 p = xdr_encode_hyper(p, 0); /* offset */ 1890 p = xdr_encode_hyper(p, 0); /* offset */
@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
1883 p = xdr_encode_hyper(p, args->lastbytewritten); 1895 p = xdr_encode_hyper(p, args->lastbytewritten);
1884 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1896 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
1885 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1897 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
1886 *p++ = cpu_to_be32(0); /* no file layout payload */ 1898
1899 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
1900 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
1901 NFS_I(inode)->layout, xdr, args);
1902 else {
1903 p = reserve_space(xdr, 4);
1904 *p = cpu_to_be32(0); /* no layout-type payload */
1905 }
1887 1906
1888 hdr->nops++; 1907 hdr->nops++;
1889 hdr->replen += decode_layoutcommit_maxsz; 1908 hdr->replen += decode_layoutcommit_maxsz;
1890 return 0; 1909 return 0;
1891} 1910}
1911
1912static void
1913encode_layoutreturn(struct xdr_stream *xdr,
1914 const struct nfs4_layoutreturn_args *args,
1915 struct compound_hdr *hdr)
1916{
1917 __be32 *p;
1918
1919 p = reserve_space(xdr, 20);
1920 *p++ = cpu_to_be32(OP_LAYOUTRETURN);
1921 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
1922 *p++ = cpu_to_be32(args->layout_type);
1923 *p++ = cpu_to_be32(IOMODE_ANY);
1924 *p = cpu_to_be32(RETURN_FILE);
1925 p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
1926 p = xdr_encode_hyper(p, 0);
1927 p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
1928 spin_lock(&args->inode->i_lock);
1929 xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1930 spin_unlock(&args->inode->i_lock);
1931 if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
1932 NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
1933 NFS_I(args->inode)->layout, xdr, args);
1934 } else {
1935 p = reserve_space(xdr, 4);
1936 *p = cpu_to_be32(0);
1937 }
1938 hdr->nops++;
1939 hdr->replen += decode_layoutreturn_maxsz;
1940}
1892#endif /* CONFIG_NFS_V4_1 */ 1941#endif /* CONFIG_NFS_V4_1 */
1893 1942
1894/* 1943/*
@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2706/* 2755/*
2707 * Encode LAYOUTCOMMIT request 2756 * Encode LAYOUTCOMMIT request
2708 */ 2757 */
2709static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, 2758static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2710 struct xdr_stream *xdr, 2759 struct xdr_stream *xdr,
2711 struct nfs4_layoutcommit_args *args) 2760 struct nfs4_layoutcommit_args *args)
2712{ 2761{
2762 struct nfs4_layoutcommit_data *data =
2763 container_of(args, struct nfs4_layoutcommit_data, args);
2713 struct compound_hdr hdr = { 2764 struct compound_hdr hdr = {
2714 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2765 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2715 }; 2766 };
@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2717 encode_compound_hdr(xdr, req, &hdr); 2768 encode_compound_hdr(xdr, req, &hdr);
2718 encode_sequence(xdr, &args->seq_args, &hdr); 2769 encode_sequence(xdr, &args->seq_args, &hdr);
2719 encode_putfh(xdr, NFS_FH(args->inode), &hdr); 2770 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2720 encode_layoutcommit(xdr, args, &hdr); 2771 encode_layoutcommit(xdr, data->args.inode, args, &hdr);
2721 encode_getfattr(xdr, args->bitmask, &hdr); 2772 encode_getfattr(xdr, args->bitmask, &hdr);
2722 encode_nops(&hdr); 2773 encode_nops(&hdr);
2723 return 0; 2774}
2775
2776/*
2777 * Encode LAYOUTRETURN request
2778 */
2779static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
2780 struct xdr_stream *xdr,
2781 struct nfs4_layoutreturn_args *args)
2782{
2783 struct compound_hdr hdr = {
2784 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2785 };
2786
2787 encode_compound_hdr(xdr, req, &hdr);
2788 encode_sequence(xdr, &args->seq_args, &hdr);
2789 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2790 encode_layoutreturn(xdr, args, &hdr);
2791 encode_nops(&hdr);
2724} 2792}
2725#endif /* CONFIG_NFS_V4_1 */ 2793#endif /* CONFIG_NFS_V4_1 */
2726 2794
@@ -5203,6 +5271,27 @@ out_overflow:
5203 return -EIO; 5271 return -EIO;
5204} 5272}
5205 5273
5274static int decode_layoutreturn(struct xdr_stream *xdr,
5275 struct nfs4_layoutreturn_res *res)
5276{
5277 __be32 *p;
5278 int status;
5279
5280 status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
5281 if (status)
5282 return status;
5283 p = xdr_inline_decode(xdr, 4);
5284 if (unlikely(!p))
5285 goto out_overflow;
5286 res->lrs_present = be32_to_cpup(p);
5287 if (res->lrs_present)
5288 status = decode_stateid(xdr, &res->stateid);
5289 return status;
5290out_overflow:
5291 print_overflow_msg(__func__, xdr);
5292 return -EIO;
5293}
5294
5206static int decode_layoutcommit(struct xdr_stream *xdr, 5295static int decode_layoutcommit(struct xdr_stream *xdr,
5207 struct rpc_rqst *req, 5296 struct rpc_rqst *req,
5208 struct nfs4_layoutcommit_res *res) 5297 struct nfs4_layoutcommit_res *res)
@@ -6320,6 +6409,30 @@ out:
6320} 6409}
6321 6410
6322/* 6411/*
6412 * Decode LAYOUTRETURN response
6413 */
6414static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
6415 struct xdr_stream *xdr,
6416 struct nfs4_layoutreturn_res *res)
6417{
6418 struct compound_hdr hdr;
6419 int status;
6420
6421 status = decode_compound_hdr(xdr, &hdr);
6422 if (status)
6423 goto out;
6424 status = decode_sequence(xdr, &res->seq_res, rqstp);
6425 if (status)
6426 goto out;
6427 status = decode_putfh(xdr);
6428 if (status)
6429 goto out;
6430 status = decode_layoutreturn(xdr, res);
6431out:
6432 return status;
6433}
6434
6435/*
6323 * Decode LAYOUTCOMMIT response 6436 * Decode LAYOUTCOMMIT response
6324 */ 6437 */
6325static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, 6438static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
@@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6547 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6660 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6548 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6661 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6549 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), 6662 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6663 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
6550#endif /* CONFIG_NFS_V4_1 */ 6664#endif /* CONFIG_NFS_V4_1 */
6551}; 6665};
6552 6666
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c541093a5bf2..c4744e1d513c 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -87,7 +87,7 @@
87#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
88 88
89/* Default NFSROOT mount options. */ 89/* Default NFSROOT mount options. */
90#define NFS_DEF_OPTIONS "udp" 90#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
91 91
92/* Parameters passed from the kernel command line */ 92/* Parameters passed from the kernel command line */
93static char nfs_root_parms[256] __initdata = ""; 93static char nfs_root_parms[256] __initdata = "";
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 000000000000..ed30ea072bb8
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Objects Layout Driver kernel module
3#
4objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
5obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 000000000000..9cf208df1f25
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,1057 @@
1/*
2 * pNFS Objects layout implementation over open-osd initiator library
3 *
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/module.h>
41#include <scsi/osd_initiator.h>
42
43#include "objlayout.h"
44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46
/*
 * Cast to unsigned long long for %llx/%llu printing.  The argument is
 * parenthesized so a compound expression is cast as a whole rather than
 * only its first operand.
 */
#define _LLU(x) ((unsigned long long)(x))
48
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
53struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node;
55 struct osd_dev *od;
56};
57
58static void
59objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62
63 dprintk("%s: free od=%p\n", __func__, de->od);
64 osduld_put_device(de->od);
65 kfree(de);
66}
67
68static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
69 const struct nfs4_deviceid *d_id)
70{
71 struct nfs4_deviceid_node *d;
72 struct objio_dev_ent *de;
73
74 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
75 if (!d)
76 return NULL;
77
78 de = container_of(d, struct objio_dev_ent, id_node);
79 return de;
80}
81
82static struct objio_dev_ent *
83_dev_list_add(const struct nfs_server *nfss,
84 const struct nfs4_deviceid *d_id, struct osd_dev *od,
85 gfp_t gfp_flags)
86{
87 struct nfs4_deviceid_node *d;
88 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
89 struct objio_dev_ent *n;
90
91 if (!de) {
92 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
93 return NULL;
94 }
95
96 dprintk("%s: Adding od=%p\n", __func__, od);
97 nfs4_init_deviceid_node(&de->id_node,
98 nfss->pnfs_curr_ld,
99 nfss->nfs_client,
100 d_id);
101 de->od = od;
102
103 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
107 objio_free_deviceid_node(&de->id_node);
108 de = n;
109 }
110
111 atomic_inc(&de->id_node.ref);
112 return de;
113}
114
115struct caps_buffers {
116 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
117 u8 creds[OSD_CAP_LEN];
118};
119
120struct objio_segment {
121 struct pnfs_layout_segment lseg;
122
123 struct pnfs_osd_object_cred *comps;
124
125 unsigned mirrors_p1;
126 unsigned stripe_unit;
127 unsigned group_width; /* Data stripe_units without integrity comps */
128 u64 group_depth;
129 unsigned group_count;
130
131 unsigned max_io_size;
132
133 unsigned comps_index;
134 unsigned num_comps;
135 /* variable length */
136 struct objio_dev_ent *ods[];
137};
138
139static inline struct objio_segment *
140OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141{
142 return container_of(lseg, struct objio_segment, lseg);
143}
144
145struct objio_state;
146typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
147
148struct objio_state {
149 /* Generic layer */
150 struct objlayout_io_state ol_state;
151
152 struct objio_segment *layout;
153
154 struct kref kref;
155 objio_done_fn done;
156 void *private;
157
158 unsigned long length;
159 unsigned numdevs; /* Actually used devs in this IO */
160 /* A per-device variable array of size numdevs */
161 struct _objio_per_comp {
162 struct bio *bio;
163 struct osd_request *or;
164 unsigned long length;
165 u64 offset;
166 unsigned dev;
167 } per_dev[];
168};
169
170/* Send and wait for a get_device_info of devices in the layout,
171 then look them up with the osd_initiator library */
172static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
173 struct objio_segment *objio_seg, unsigned comp,
174 gfp_t gfp_flags)
175{
176 struct pnfs_osd_deviceaddr *deviceaddr;
177 struct nfs4_deviceid *d_id;
178 struct objio_dev_ent *ode;
179 struct osd_dev *od;
180 struct osd_dev_info odi;
181 int err;
182
183 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
184
185 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
186 if (ode)
187 return ode;
188
189 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
190 if (unlikely(err)) {
191 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
192 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
193 return ERR_PTR(err);
194 }
195
196 odi.systemid_len = deviceaddr->oda_systemid.len;
197 if (odi.systemid_len > sizeof(odi.systemid)) {
198 err = -EINVAL;
199 goto out;
200 } else if (odi.systemid_len)
201 memcpy(odi.systemid, deviceaddr->oda_systemid.data,
202 odi.systemid_len);
203 odi.osdname_len = deviceaddr->oda_osdname.len;
204 odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
205
206 if (!odi.osdname_len && !odi.systemid_len) {
207 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
208 __func__);
209 err = -ENODEV;
210 goto out;
211 }
212
213 od = osduld_info_lookup(&odi);
214 if (unlikely(IS_ERR(od))) {
215 err = PTR_ERR(od);
216 dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
217 goto out;
218 }
219
220 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
221 gfp_flags);
222
223out:
224 dprintk("%s: return=%d\n", __func__, err);
225 objlayout_put_deviceinfo(deviceaddr);
226 return err ? ERR_PTR(err) : ode;
227}
228
229static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
230 struct objio_segment *objio_seg,
231 gfp_t gfp_flags)
232{
233 unsigned i;
234 int err;
235
236 /* lookup all devices */
237 for (i = 0; i < objio_seg->num_comps; i++) {
238 struct objio_dev_ent *ode;
239
240 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
241 if (unlikely(IS_ERR(ode))) {
242 err = PTR_ERR(ode);
243 goto out;
244 }
245 objio_seg->ods[i] = ode;
246 }
247 err = 0;
248
249out:
250 dprintk("%s: return=%d\n", __func__, err);
251 return err;
252}
253
254static int _verify_data_map(struct pnfs_osd_layout *layout)
255{
256 struct pnfs_osd_data_map *data_map = &layout->olo_map;
257 u64 stripe_length;
258 u32 group_width;
259
260/* FIXME: Only raid0 for now. if not go through MDS */
261 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
262 printk(KERN_ERR "Only RAID_0 for now\n");
263 return -ENOTSUPP;
264 }
265 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
266 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
267 data_map->odm_num_comps, data_map->odm_mirror_cnt);
268 return -EINVAL;
269 }
270
271 if (data_map->odm_group_width)
272 group_width = data_map->odm_group_width;
273 else
274 group_width = data_map->odm_num_comps /
275 (data_map->odm_mirror_cnt + 1);
276
277 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
278 if (stripe_length >= (1ULL << 32)) {
279 printk(KERN_ERR "Total Stripe length(0x%llx)"
280 " >= 32bit is not supported\n", _LLU(stripe_length));
281 return -ENOTSUPP;
282 }
283
284 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
285 printk(KERN_ERR "Stripe Unit(0x%llx)"
286 " must be Multples of PAGE_SIZE(0x%lx)\n",
287 _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
288 return -ENOTSUPP;
289 }
290
291 return 0;
292}
293
294static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
295 struct pnfs_osd_object_cred *src_comp,
296 struct caps_buffers *caps_p)
297{
298 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
299 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
300
301 *cur_comp = *src_comp;
302
303 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
304 sizeof(caps_p->caps_key));
305 cur_comp->oc_cap_key.cred = caps_p->caps_key;
306
307 memcpy(caps_p->creds, src_comp->oc_cap.cred,
308 sizeof(caps_p->creds));
309 cur_comp->oc_cap.cred = caps_p->creds;
310}
311
312int objio_alloc_lseg(struct pnfs_layout_segment **outp,
313 struct pnfs_layout_hdr *pnfslay,
314 struct pnfs_layout_range *range,
315 struct xdr_stream *xdr,
316 gfp_t gfp_flags)
317{
318 struct objio_segment *objio_seg;
319 struct pnfs_osd_xdr_decode_layout_iter iter;
320 struct pnfs_osd_layout layout;
321 struct pnfs_osd_object_cred *cur_comp, src_comp;
322 struct caps_buffers *caps_p;
323 int err;
324
325 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
326 if (unlikely(err))
327 return err;
328
329 err = _verify_data_map(&layout);
330 if (unlikely(err))
331 return err;
332
333 objio_seg = kzalloc(sizeof(*objio_seg) +
334 sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
335 sizeof(*objio_seg->comps) * layout.olo_num_comps +
336 sizeof(struct caps_buffers) * layout.olo_num_comps,
337 gfp_flags);
338 if (!objio_seg)
339 return -ENOMEM;
340
341 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
342 cur_comp = objio_seg->comps;
343 caps_p = (void *)(cur_comp + layout.olo_num_comps);
344 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
345 copy_single_comp(cur_comp++, &src_comp, caps_p++);
346 if (unlikely(err))
347 goto err;
348
349 objio_seg->num_comps = layout.olo_num_comps;
350 objio_seg->comps_index = layout.olo_comps_index;
351 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
352 if (err)
353 goto err;
354
355 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
356 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
357 if (layout.olo_map.odm_group_width) {
358 objio_seg->group_width = layout.olo_map.odm_group_width;
359 objio_seg->group_depth = layout.olo_map.odm_group_depth;
360 objio_seg->group_count = layout.olo_map.odm_num_comps /
361 objio_seg->mirrors_p1 /
362 objio_seg->group_width;
363 } else {
364 objio_seg->group_width = layout.olo_map.odm_num_comps /
365 objio_seg->mirrors_p1;
366 objio_seg->group_depth = -1;
367 objio_seg->group_count = 1;
368 }
369
370 /* Cache this calculation it will hit for every page */
371 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
372 objio_seg->stripe_unit) *
373 objio_seg->group_width;
374
375 *outp = &objio_seg->lseg;
376 return 0;
377
378err:
379 kfree(objio_seg);
380 dprintk("%s: Error: return %d\n", __func__, err);
381 *outp = NULL;
382 return err;
383}
384
385void objio_free_lseg(struct pnfs_layout_segment *lseg)
386{
387 int i;
388 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
389
390 for (i = 0; i < objio_seg->num_comps; i++) {
391 if (!objio_seg->ods[i])
392 break;
393 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
394 }
395 kfree(objio_seg);
396}
397
398int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
399 struct objlayout_io_state **outp,
400 gfp_t gfp_flags)
401{
402 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
403 struct objio_state *ios;
404 const unsigned first_size = sizeof(*ios) +
405 objio_seg->num_comps * sizeof(ios->per_dev[0]);
406 const unsigned sec_size = objio_seg->num_comps *
407 sizeof(ios->ol_state.ioerrs[0]);
408
409 ios = kzalloc(first_size + sec_size, gfp_flags);
410 if (unlikely(!ios))
411 return -ENOMEM;
412
413 ios->layout = objio_seg;
414 ios->ol_state.ioerrs = ((void *)ios) + first_size;
415 ios->ol_state.num_comps = objio_seg->num_comps;
416
417 *outp = &ios->ol_state;
418 return 0;
419}
420
421void objio_free_io_state(struct objlayout_io_state *ol_state)
422{
423 struct objio_state *ios = container_of(ol_state, struct objio_state,
424 ol_state);
425
426 kfree(ios);
427}
428
429enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
430{
431 switch (oep) {
432 case OSD_ERR_PRI_NO_ERROR:
433 return (enum pnfs_osd_errno)0;
434
435 case OSD_ERR_PRI_CLEAR_PAGES:
436 BUG_ON(1);
437 return 0;
438
439 case OSD_ERR_PRI_RESOURCE:
440 return PNFS_OSD_ERR_RESOURCE;
441 case OSD_ERR_PRI_BAD_CRED:
442 return PNFS_OSD_ERR_BAD_CRED;
443 case OSD_ERR_PRI_NO_ACCESS:
444 return PNFS_OSD_ERR_NO_ACCESS;
445 case OSD_ERR_PRI_UNREACHABLE:
446 return PNFS_OSD_ERR_UNREACHABLE;
447 case OSD_ERR_PRI_NOT_FOUND:
448 return PNFS_OSD_ERR_NOT_FOUND;
449 case OSD_ERR_PRI_NO_SPACE:
450 return PNFS_OSD_ERR_NO_SPACE;
451 default:
452 WARN_ON(1);
453 /* fallthrough */
454 case OSD_ERR_PRI_EIO:
455 return PNFS_OSD_ERR_EIO;
456 }
457}
458
459static void _clear_bio(struct bio *bio)
460{
461 struct bio_vec *bv;
462 unsigned i;
463
464 __bio_for_each_segment(bv, bio, i, 0) {
465 unsigned this_count = bv->bv_len;
466
467 if (likely(PAGE_SIZE == this_count))
468 clear_highpage(bv->bv_page);
469 else
470 zero_user(bv->bv_page, bv->bv_offset, this_count);
471 }
472}
473
474static int _io_check(struct objio_state *ios, bool is_write)
475{
476 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
477 int lin_ret = 0;
478 int i;
479
480 for (i = 0; i < ios->numdevs; i++) {
481 struct osd_sense_info osi;
482 struct osd_request *or = ios->per_dev[i].or;
483 unsigned dev;
484 int ret;
485
486 if (!or)
487 continue;
488
489 ret = osd_req_decode_sense(or, &osi);
490 if (likely(!ret))
491 continue;
492
493 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
494 /* start read offset passed endof file */
495 BUG_ON(is_write);
496 _clear_bio(ios->per_dev[i].bio);
497 dprintk("%s: start read offset passed end of file "
498 "offset=0x%llx, length=0x%lx\n", __func__,
499 _LLU(ios->per_dev[i].offset),
500 ios->per_dev[i].length);
501
502 continue; /* we recovered */
503 }
504 dev = ios->per_dev[i].dev;
505 objlayout_io_set_result(&ios->ol_state, dev,
506 &ios->layout->comps[dev].oc_object_id,
507 osd_pri_2_pnfs_err(osi.osd_err_pri),
508 ios->per_dev[i].offset,
509 ios->per_dev[i].length,
510 is_write);
511
512 if (osi.osd_err_pri >= oep) {
513 oep = osi.osd_err_pri;
514 lin_ret = ret;
515 }
516 }
517
518 return lin_ret;
519}
520
521/*
522 * Common IO state helpers.
523 */
524static void _io_free(struct objio_state *ios)
525{
526 unsigned i;
527
528 for (i = 0; i < ios->numdevs; i++) {
529 struct _objio_per_comp *per_dev = &ios->per_dev[i];
530
531 if (per_dev->or) {
532 osd_end_request(per_dev->or);
533 per_dev->or = NULL;
534 }
535
536 if (per_dev->bio) {
537 bio_put(per_dev->bio);
538 per_dev->bio = NULL;
539 }
540 }
541}
542
543struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
544{
545 unsigned min_dev = ios->layout->comps_index;
546 unsigned max_dev = min_dev + ios->layout->num_comps;
547
548 BUG_ON(dev < min_dev || max_dev <= dev);
549 return ios->layout->ods[dev - min_dev]->od;
550}
551
552struct _striping_info {
553 u64 obj_offset;
554 u64 group_length;
555 unsigned dev;
556 unsigned unit_off;
557};
558
559static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
560 struct _striping_info *si)
561{
562 u32 stripe_unit = ios->layout->stripe_unit;
563 u32 group_width = ios->layout->group_width;
564 u64 group_depth = ios->layout->group_depth;
565 u32 U = stripe_unit * group_width;
566
567 u64 T = U * group_depth;
568 u64 S = T * ios->layout->group_count;
569 u64 M = div64_u64(file_offset, S);
570
571 /*
572 G = (L - (M * S)) / T
573 H = (L - (M * S)) % T
574 */
575 u64 LmodU = file_offset - M * S;
576 u32 G = div64_u64(LmodU, T);
577 u64 H = LmodU - G * T;
578
579 u32 N = div_u64(H, U);
580
581 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
582 si->obj_offset = si->unit_off + (N * stripe_unit) +
583 (M * group_depth * stripe_unit);
584
585 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
586 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
587 si->dev *= ios->layout->mirrors_p1;
588
589 si->group_length = T - H;
590}
591
592static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
593 unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
594 gfp_t gfp_flags)
595{
596 unsigned pg = *cur_pg;
597 struct request_queue *q =
598 osd_request_queue(_io_od(ios, per_dev->dev));
599
600 per_dev->length += cur_len;
601
602 if (per_dev->bio == NULL) {
603 unsigned stripes = ios->layout->num_comps /
604 ios->layout->mirrors_p1;
605 unsigned pages_in_stripe = stripes *
606 (ios->layout->stripe_unit / PAGE_SIZE);
607 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
608 stripes;
609
610 if (BIO_MAX_PAGES_KMALLOC < bio_size)
611 bio_size = BIO_MAX_PAGES_KMALLOC;
612
613 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
614 if (unlikely(!per_dev->bio)) {
615 dprintk("Faild to allocate BIO size=%u\n", bio_size);
616 return -ENOMEM;
617 }
618 }
619
620 while (cur_len > 0) {
621 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
622 unsigned added_len;
623
624 BUG_ON(ios->ol_state.nr_pages <= pg);
625 cur_len -= pglen;
626
627 added_len = bio_add_pc_page(q, per_dev->bio,
628 ios->ol_state.pages[pg], pglen, pgbase);
629 if (unlikely(pglen != added_len))
630 return -ENOMEM;
631 pgbase = 0;
632 ++pg;
633 }
634 BUG_ON(cur_len);
635
636 *cur_pg = pg;
637 return 0;
638}
639
640static int _prepare_one_group(struct objio_state *ios, u64 length,
641 struct _striping_info *si, unsigned *last_pg,
642 gfp_t gfp_flags)
643{
644 unsigned stripe_unit = ios->layout->stripe_unit;
645 unsigned mirrors_p1 = ios->layout->mirrors_p1;
646 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
647 unsigned dev = si->dev;
648 unsigned first_dev = dev - (dev % devs_in_group);
649 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
650 unsigned cur_pg = *last_pg;
651 int ret = 0;
652
653 while (length) {
654 struct _objio_per_comp *per_dev = &ios->per_dev[dev];
655 unsigned cur_len, page_off = 0;
656
657 if (!per_dev->length) {
658 per_dev->dev = dev;
659 if (dev < si->dev) {
660 per_dev->offset = si->obj_offset + stripe_unit -
661 si->unit_off;
662 cur_len = stripe_unit;
663 } else if (dev == si->dev) {
664 per_dev->offset = si->obj_offset;
665 cur_len = stripe_unit - si->unit_off;
666 page_off = si->unit_off & ~PAGE_MASK;
667 BUG_ON(page_off &&
668 (page_off != ios->ol_state.pgbase));
669 } else { /* dev > si->dev */
670 per_dev->offset = si->obj_offset - si->unit_off;
671 cur_len = stripe_unit;
672 }
673
674 if (max_comp < dev)
675 max_comp = dev;
676 } else {
677 cur_len = stripe_unit;
678 }
679 if (cur_len >= length)
680 cur_len = length;
681
682 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
683 cur_len, gfp_flags);
684 if (unlikely(ret))
685 goto out;
686
687 dev += mirrors_p1;
688 dev = (dev % devs_in_group) + first_dev;
689
690 length -= cur_len;
691 ios->length += cur_len;
692 }
693out:
694 ios->numdevs = max_comp + mirrors_p1;
695 *last_pg = cur_pg;
696 return ret;
697}
698
699static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
700{
701 u64 length = ios->ol_state.count;
702 u64 offset = ios->ol_state.offset;
703 struct _striping_info si;
704 unsigned last_pg = 0;
705 int ret = 0;
706
707 while (length) {
708 _calc_stripe_info(ios, offset, &si);
709
710 if (length < si.group_length)
711 si.group_length = length;
712
713 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
714 if (unlikely(ret))
715 goto out;
716
717 offset += si.group_length;
718 length -= si.group_length;
719 }
720
721out:
722 if (!ios->length)
723 return ret;
724
725 return 0;
726}
727
728static ssize_t _sync_done(struct objio_state *ios)
729{
730 struct completion *waiting = ios->private;
731
732 complete(waiting);
733 return 0;
734}
735
736static void _last_io(struct kref *kref)
737{
738 struct objio_state *ios = container_of(kref, struct objio_state, kref);
739
740 ios->done(ios);
741}
742
743static void _done_io(struct osd_request *or, void *p)
744{
745 struct objio_state *ios = p;
746
747 kref_put(&ios->kref, _last_io);
748}
749
750static ssize_t _io_exec(struct objio_state *ios)
751{
752 DECLARE_COMPLETION_ONSTACK(wait);
753 ssize_t status = 0; /* sync status */
754 unsigned i;
755 objio_done_fn saved_done_fn = ios->done;
756 bool sync = ios->ol_state.sync;
757
758 if (sync) {
759 ios->done = _sync_done;
760 ios->private = &wait;
761 }
762
763 kref_init(&ios->kref);
764
765 for (i = 0; i < ios->numdevs; i++) {
766 struct osd_request *or = ios->per_dev[i].or;
767
768 if (!or)
769 continue;
770
771 kref_get(&ios->kref);
772 osd_execute_request_async(or, _done_io, ios);
773 }
774
775 kref_put(&ios->kref, _last_io);
776
777 if (sync) {
778 wait_for_completion(&wait);
779 status = saved_done_fn(ios);
780 }
781
782 return status;
783}
784
785/*
786 * read
787 */
788static ssize_t _read_done(struct objio_state *ios)
789{
790 ssize_t status;
791 int ret = _io_check(ios, false);
792
793 _io_free(ios);
794
795 if (likely(!ret))
796 status = ios->length;
797 else
798 status = ret;
799
800 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
801 return status;
802}
803
804static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
805{
806 struct osd_request *or = NULL;
807 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
808 unsigned dev = per_dev->dev;
809 struct pnfs_osd_object_cred *cred =
810 &ios->layout->comps[dev];
811 struct osd_obj_id obj = {
812 .partition = cred->oc_object_id.oid_partition_id,
813 .id = cred->oc_object_id.oid_object_id,
814 };
815 int ret;
816
817 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
818 if (unlikely(!or)) {
819 ret = -ENOMEM;
820 goto err;
821 }
822 per_dev->or = or;
823
824 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
825
826 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
827 if (ret) {
828 dprintk("%s: Faild to osd_finalize_request() => %d\n",
829 __func__, ret);
830 goto err;
831 }
832
833 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
834 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
835 per_dev->length);
836
837err:
838 return ret;
839}
840
841static ssize_t _read_exec(struct objio_state *ios)
842{
843 unsigned i;
844 int ret;
845
846 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
847 if (!ios->per_dev[i].length)
848 continue;
849 ret = _read_mirrors(ios, i);
850 if (unlikely(ret))
851 goto err;
852 }
853
854 ios->done = _read_done;
855 return _io_exec(ios); /* In sync mode exec returns the io status */
856
857err:
858 _io_free(ios);
859 return ret;
860}
861
862ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
863{
864 struct objio_state *ios = container_of(ol_state, struct objio_state,
865 ol_state);
866 int ret;
867
868 ret = _io_rw_pagelist(ios, GFP_KERNEL);
869 if (unlikely(ret))
870 return ret;
871
872 return _read_exec(ios);
873}
874
875/*
876 * write
877 */
878static ssize_t _write_done(struct objio_state *ios)
879{
880 ssize_t status;
881 int ret = _io_check(ios, true);
882
883 _io_free(ios);
884
885 if (likely(!ret)) {
886 /* FIXME: should be based on the OSD's persistence model
887 * See OSD2r05 Section 4.13 Data persistence model */
888 ios->ol_state.committed = NFS_FILE_SYNC;
889 status = ios->length;
890 } else {
891 status = ret;
892 }
893
894 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
895 return status;
896}
897
898static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
899{
900 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
901 unsigned dev = ios->per_dev[cur_comp].dev;
902 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
903 int ret;
904
905 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
906 struct osd_request *or = NULL;
907 struct pnfs_osd_object_cred *cred =
908 &ios->layout->comps[dev];
909 struct osd_obj_id obj = {
910 .partition = cred->oc_object_id.oid_partition_id,
911 .id = cred->oc_object_id.oid_object_id,
912 };
913 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
914 struct bio *bio;
915
916 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
917 if (unlikely(!or)) {
918 ret = -ENOMEM;
919 goto err;
920 }
921 per_dev->or = or;
922
923 if (per_dev != master_dev) {
924 bio = bio_kmalloc(GFP_NOFS,
925 master_dev->bio->bi_max_vecs);
926 if (unlikely(!bio)) {
927 dprintk("Faild to allocate BIO size=%u\n",
928 master_dev->bio->bi_max_vecs);
929 ret = -ENOMEM;
930 goto err;
931 }
932
933 __bio_clone(bio, master_dev->bio);
934 bio->bi_bdev = NULL;
935 bio->bi_next = NULL;
936 per_dev->bio = bio;
937 per_dev->dev = dev;
938 per_dev->length = master_dev->length;
939 per_dev->offset = master_dev->offset;
940 } else {
941 bio = master_dev->bio;
942 bio->bi_rw |= REQ_WRITE;
943 }
944
945 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
946
947 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
948 if (ret) {
949 dprintk("%s: Faild to osd_finalize_request() => %d\n",
950 __func__, ret);
951 goto err;
952 }
953
954 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
955 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
956 per_dev->length);
957 }
958
959err:
960 return ret;
961}
962
963static ssize_t _write_exec(struct objio_state *ios)
964{
965 unsigned i;
966 int ret;
967
968 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
969 if (!ios->per_dev[i].length)
970 continue;
971 ret = _write_mirrors(ios, i);
972 if (unlikely(ret))
973 goto err;
974 }
975
976 ios->done = _write_done;
977 return _io_exec(ios); /* In sync mode exec returns the io->status */
978
979err:
980 _io_free(ios);
981 return ret;
982}
983
984ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
985{
986 struct objio_state *ios = container_of(ol_state, struct objio_state,
987 ol_state);
988 int ret;
989
990 /* TODO: ios->stable = stable; */
991 ret = _io_rw_pagelist(ios, GFP_NOFS);
992 if (unlikely(ret))
993 return ret;
994
995 return _write_exec(ios);
996}
997
998static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
999 struct nfs_page *prev, struct nfs_page *req)
1000{
1001 if (!pnfs_generic_pg_test(pgio, prev, req))
1002 return false;
1003
1004 return pgio->pg_count + req->wb_bytes <=
1005 OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
1006}
1007
1008static struct pnfs_layoutdriver_type objlayout_type = {
1009 .id = LAYOUT_OSD2_OBJECTS,
1010 .name = "LAYOUT_OSD2_OBJECTS",
1011 .flags = PNFS_LAYOUTRET_ON_SETATTR,
1012
1013 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
1014 .free_layout_hdr = objlayout_free_layout_hdr,
1015
1016 .alloc_lseg = objlayout_alloc_lseg,
1017 .free_lseg = objlayout_free_lseg,
1018
1019 .read_pagelist = objlayout_read_pagelist,
1020 .write_pagelist = objlayout_write_pagelist,
1021 .pg_test = objio_pg_test,
1022
1023 .free_deviceid_node = objio_free_deviceid_node,
1024
1025 .encode_layoutcommit = objlayout_encode_layoutcommit,
1026 .encode_layoutreturn = objlayout_encode_layoutreturn,
1027};
1028
1029MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
1030MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
1031MODULE_LICENSE("GPL");
1032
1033static int __init
1034objlayout_init(void)
1035{
1036 int ret = pnfs_register_layoutdriver(&objlayout_type);
1037
1038 if (ret)
1039 printk(KERN_INFO
1040 "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
1041 __func__, ret);
1042 else
1043 printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
1044 __func__);
1045 return ret;
1046}
1047
1048static void __exit
1049objlayout_exit(void)
1050{
1051 pnfs_unregister_layoutdriver(&objlayout_type);
1052 printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
1053 __func__);
1054}
1055
1056module_init(objlayout_init);
1057module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 000000000000..dc3956c0de80
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,712 @@
1/*
2 * pNFS Objects layout driver high level definitions
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <scsi/osd_initiator.h>
41#include "objlayout.h"
42
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44/*
45 * Create a objlayout layout structure for the given inode and return it.
46 */
47struct pnfs_layout_hdr *
48objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
49{
50 struct objlayout *objlay;
51
52 objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
53 if (objlay) {
54 spin_lock_init(&objlay->lock);
55 INIT_LIST_HEAD(&objlay->err_list);
56 }
57 dprintk("%s: Return %p\n", __func__, objlay);
58 return &objlay->pnfs_layout;
59}
60
61/*
62 * Free an objlayout layout structure
63 */
64void
65objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
66{
67 struct objlayout *objlay = OBJLAYOUT(lo);
68
69 dprintk("%s: objlay %p\n", __func__, objlay);
70
71 WARN_ON(!list_empty(&objlay->err_list));
72 kfree(objlay);
73}
74
75/*
76 * Unmarshall layout and store it in pnfslay.
77 */
78struct pnfs_layout_segment *
79objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
80 struct nfs4_layoutget_res *lgr,
81 gfp_t gfp_flags)
82{
83 int status = -ENOMEM;
84 struct xdr_stream stream;
85 struct xdr_buf buf = {
86 .pages = lgr->layoutp->pages,
87 .page_len = lgr->layoutp->len,
88 .buflen = lgr->layoutp->len,
89 .len = lgr->layoutp->len,
90 };
91 struct page *scratch;
92 struct pnfs_layout_segment *lseg;
93
94 dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
95
96 scratch = alloc_page(gfp_flags);
97 if (!scratch)
98 goto err_nofree;
99
100 xdr_init_decode(&stream, &buf, NULL);
101 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
102
103 status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
104 if (unlikely(status)) {
105 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
106 status);
107 goto err;
108 }
109
110 __free_page(scratch);
111
112 dprintk("%s: Return %p\n", __func__, lseg);
113 return lseg;
114
115err:
116 __free_page(scratch);
117err_nofree:
118 dprintk("%s: Err Return=>%d\n", __func__, status);
119 return ERR_PTR(status);
120}
121
/*
 * Free a layout segment. A NULL @lseg is tolerated and ignored.
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (likely(lseg))
		objio_free_lseg(lseg);
}
135
136/*
137 * I/O Operations
138 */
139static inline u64
140end_offset(u64 start, u64 len)
141{
142 u64 end;
143
144 end = start + len;
145 return end >= start ? end : NFS4_MAX_UINT64;
146}
147
148/* last octet in a range */
149static inline u64
150last_byte_offset(u64 start, u64 len)
151{
152 u64 end;
153
154 BUG_ON(!len);
155 end = start + len;
156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157}
158
159static struct objlayout_io_state *
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
161 struct page **pages,
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset;
171
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) {
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185
186 if (pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT;
188 pgbase &= ~PAGE_MASK;
189 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
/* Release an io_state through the raid engine; NULL is ignored */
static void
objlayout_free_io_state(struct objlayout_io_state *state)
{
	dprintk("%s: freeing io_state\n", __func__);

	if (likely(state))
		objio_free_io_state(state);
}
213
214/*
215 * I/O done common code
216 */
217static void
218objlayout_iodone(struct objlayout_io_state *state)
219{
220 dprintk("%s: state %p status\n", __func__, state);
221
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
226
227 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list);
230 spin_unlock(&objlay->lock);
231 }
232}
233
234/*
235 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
236 *
237 * The @index component IO failed (error returned from target). Register
238 * the error for later reporting at layout-return.
239 */
240void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write)
244{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
246
247 BUG_ON(index >= state->num_comps);
248 if (osd_error) {
249 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset;
251 ioerr->oer_comp_length = length;
252 ioerr->oer_iswrite = is_write;
253 ioerr->oer_errno = osd_error;
254
255 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
256 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
257 __func__, index, ioerr->oer_errno,
258 ioerr->oer_iswrite,
259 _DEVID_LO(&ioerr->oer_component.oid_device_id),
260 _DEVID_HI(&ioerr->oer_component.oid_device_id),
261 ioerr->oer_component.oid_partition_id,
262 ioerr->oer_component.oid_object_id,
263 ioerr->oer_comp_offset,
264 ioerr->oer_comp_length);
265 } else {
266 /* User need not call if no error is reported */
267 ioerr->oer_errno = 0;
268 }
269}
270
271/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
272 * This is because the osd completion is called with ints-off from
273 * the block layer
274 */
275static void _rpc_read_complete(struct work_struct *work)
276{
277 struct rpc_task *task;
278 struct nfs_read_data *rdata;
279
280 dprintk("%s enter\n", __func__);
281 task = container_of(work, struct rpc_task, u.tk_work);
282 rdata = container_of(task, struct nfs_read_data, task);
283
284 pnfs_ld_read_done(rdata);
285}
286
287void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
289{
290 int eof = state->eof;
291 struct nfs_read_data *rdata;
292
293 state->status = status;
294 dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status;
299 rdata->res.eof = eof;
300 }
301 objlayout_iodone(state);
302 /* must not use state after this point */
303
304 if (sync)
305 pnfs_ld_read_done(rdata);
306 else {
307 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
308 schedule_work(&rdata->task.u.tk_work);
309 }
310}
311
312/*
313 * Perform sync or async reads.
314 */
315enum pnfs_try_status
316objlayout_read_pagelist(struct nfs_read_data *rdata)
317{
318 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count;
320 struct objlayout_io_state *state;
321 ssize_t status = 0;
322 loff_t eof;
323
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) {
330 status = 0;
331 rdata->res.count = 0;
332 rdata->res.eof = 1;
333 goto out;
334 }
335 count = eof - offset;
336 }
337
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
339 rdata->args.pages, rdata->args.pgbase,
340 offset, count,
341 rdata->lseg, rdata,
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347
348 state->eof = state->offset + state->count >= eof;
349
350 status = objio_read_pagelist(state);
351 out:
352 dprintk("%s: Return status %Zd\n", __func__, status);
353 rdata->pnfs_error = status;
354 return PNFS_ATTEMPTED;
355}
356
357/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
358 * This is because the osd completion is called with ints-off from
359 * the block layer
360 */
361static void _rpc_write_complete(struct work_struct *work)
362{
363 struct rpc_task *task;
364 struct nfs_write_data *wdata;
365
366 dprintk("%s enter\n", __func__);
367 task = container_of(work, struct rpc_task, u.tk_work);
368 wdata = container_of(task, struct nfs_write_data, task);
369
370 pnfs_ld_write_done(wdata);
371}
372
373void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
375 bool sync)
376{
377 struct nfs_write_data *wdata;
378
379 dprintk("%s: Begin\n", __func__);
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) {
384 wdata->res.count = status;
385 wdata->verf.committed = state->committed;
386 dprintk("%s: Return status %d committed %d\n",
387 __func__, wdata->task.tk_status,
388 wdata->verf.committed);
389 } else
390 dprintk("%s: Return status %d\n",
391 __func__, wdata->task.tk_status);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394
395 if (sync)
396 pnfs_ld_write_done(wdata);
397 else {
398 INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
399 schedule_work(&wdata->task.u.tk_work);
400 }
401}
402
403/*
404 * Perform sync or async writes.
405 */
406enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how)
409{
410 struct objlayout_io_state *state;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427
428 state->sync = how & FLUSH_SYNC;
429
430 status = objio_write_pagelist(state, how & FLUSH_STABLE);
431 out:
432 dprintk("%s: Return status %Zd\n", __func__, status);
433 wdata->pnfs_error = status;
434 return PNFS_ATTEMPTED;
435}
436
437void
438objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
439 struct xdr_stream *xdr,
440 const struct nfs4_layoutcommit_args *args)
441{
442 struct objlayout *objlay = OBJLAYOUT(pnfslay);
443 struct pnfs_osd_layoutupdate lou;
444 __be32 *start;
445
446 dprintk("%s: Begin\n", __func__);
447
448 spin_lock(&objlay->lock);
449 lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
450 lou.dsu_delta = objlay->delta_space_used;
451 objlay->delta_space_used = 0;
452 objlay->delta_space_valid = OBJ_DSU_INIT;
453 lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
454 spin_unlock(&objlay->lock);
455
456 start = xdr_reserve_space(xdr, 4);
457
458 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
459
460 *start = cpu_to_be32((xdr->p - start - 1) * 4);
461
462 dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
463 lou.dsu_delta, lou.olu_ioerr_flag);
464}
465
466static int
467err_prio(u32 oer_errno)
468{
469 switch (oer_errno) {
470 case 0:
471 return 0;
472
473 case PNFS_OSD_ERR_RESOURCE:
474 return OSD_ERR_PRI_RESOURCE;
475 case PNFS_OSD_ERR_BAD_CRED:
476 return OSD_ERR_PRI_BAD_CRED;
477 case PNFS_OSD_ERR_NO_ACCESS:
478 return OSD_ERR_PRI_NO_ACCESS;
479 case PNFS_OSD_ERR_UNREACHABLE:
480 return OSD_ERR_PRI_UNREACHABLE;
481 case PNFS_OSD_ERR_NOT_FOUND:
482 return OSD_ERR_PRI_NOT_FOUND;
483 case PNFS_OSD_ERR_NO_SPACE:
484 return OSD_ERR_PRI_NO_SPACE;
485 default:
486 WARN_ON(1);
487 /* fallthrough */
488 case PNFS_OSD_ERR_EIO:
489 return OSD_ERR_PRI_EIO;
490 }
491}
492
493static void
494merge_ioerr(struct pnfs_osd_ioerr *dest_err,
495 const struct pnfs_osd_ioerr *src_err)
496{
497 u64 dest_end, src_end;
498
499 if (!dest_err->oer_errno) {
500 *dest_err = *src_err;
501 /* accumulated device must be blank */
502 memset(&dest_err->oer_component.oid_device_id, 0,
503 sizeof(dest_err->oer_component.oid_device_id));
504
505 return;
506 }
507
508 if (dest_err->oer_component.oid_partition_id !=
509 src_err->oer_component.oid_partition_id)
510 dest_err->oer_component.oid_partition_id = 0;
511
512 if (dest_err->oer_component.oid_object_id !=
513 src_err->oer_component.oid_object_id)
514 dest_err->oer_component.oid_object_id = 0;
515
516 if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
517 dest_err->oer_comp_offset = src_err->oer_comp_offset;
518
519 dest_end = end_offset(dest_err->oer_comp_offset,
520 dest_err->oer_comp_length);
521 src_end = end_offset(src_err->oer_comp_offset,
522 src_err->oer_comp_length);
523 if (dest_end < src_end)
524 dest_end = src_end;
525
526 dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
527
528 if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
529 (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
530 dest_err->oer_errno = src_err->oer_errno;
531 } else if (src_err->oer_iswrite) {
532 dest_err->oer_iswrite = true;
533 dest_err->oer_errno = src_err->oer_errno;
534 }
535}
536
537static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{
540 struct objlayout_io_state *state, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
544 unsigned i;
545
546 for (i = 0; i < state->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
548
549 if (!ioerr->oer_errno)
550 continue;
551
552 printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
553 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
554 "offset=0x%llx length=0x%llx\n",
555 __func__, i, ioerr->oer_errno,
556 ioerr->oer_iswrite,
557 _DEVID_LO(&ioerr->oer_component.oid_device_id),
558 _DEVID_HI(&ioerr->oer_component.oid_device_id),
559 ioerr->oer_component.oid_partition_id,
560 ioerr->oer_component.oid_object_id,
561 ioerr->oer_comp_offset,
562 ioerr->oer_comp_length);
563
564 merge_ioerr(&accumulated_err, ioerr);
565 }
566 list_del(&state->err_list);
567 objlayout_free_io_state(state);
568 }
569
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
571}
572
573void
574objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
575 struct xdr_stream *xdr,
576 const struct nfs4_layoutreturn_args *args)
577{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp;
580 __be32 *start;
581
582 dprintk("%s: Begin\n", __func__);
583 start = xdr_reserve_space(xdr, 4);
584 BUG_ON(!start);
585
586 spin_lock(&objlay->lock);
587
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p;
590 unsigned i;
591 int res = 0;
592
593 for (i = 0; i < state->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
595
596 if (!ioerr->oer_errno)
597 continue;
598
599 dprintk("%s: err[%d]: errno=%d is_write=%d "
600 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
601 "offset=0x%llx length=0x%llx\n",
602 __func__, i, ioerr->oer_errno,
603 ioerr->oer_iswrite,
604 _DEVID_LO(&ioerr->oer_component.oid_device_id),
605 _DEVID_HI(&ioerr->oer_component.oid_device_id),
606 ioerr->oer_component.oid_partition_id,
607 ioerr->oer_component.oid_object_id,
608 ioerr->oer_comp_offset,
609 ioerr->oer_comp_length);
610
611 p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
612 if (unlikely(!p)) {
613 res = -E2BIG;
614 break; /* accumulated_error */
615 }
616
617 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
619 }
620
621 /* TODO: use xdr_write_pages */
622 if (unlikely(res)) {
623 /* no space for even one error descriptor */
624 BUG_ON(!last_xdr);
625
626 /* we've encountered a situation with lots and lots of
627 * errors and no space to encode them all. Use the last
628 * available slot to report the union of all the
629 * remaining errors.
630 */
631 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done;
633 }
634 list_del(&state->err_list);
635 objlayout_free_io_state(state);
636 }
637loop_done:
638 spin_unlock(&objlay->lock);
639
640 *start = cpu_to_be32((xdr->p - start - 1) * 4);
641 dprintk("%s: Return\n", __func__);
642}
643
644
645/*
646 * Get Device Info API for io engines
647 */
648struct objlayout_deviceinfo {
649 struct page *page;
650 struct pnfs_osd_deviceaddr da; /* This must be last */
651};
652
653/* Initialize and call nfs_getdeviceinfo, then decode and return a
654 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
655 * should be called.
656 */
657int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
658 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
659 gfp_t gfp_flags)
660{
661 struct objlayout_deviceinfo *odi;
662 struct pnfs_device pd;
663 struct super_block *sb;
664 struct page *page, **pages;
665 u32 *p;
666 int err;
667
668 page = alloc_page(gfp_flags);
669 if (!page)
670 return -ENOMEM;
671
672 pages = &page;
673 pd.pages = pages;
674
675 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
676 pd.layout_type = LAYOUT_OSD2_OBJECTS;
677 pd.pages = &page;
678 pd.pgbase = 0;
679 pd.pglen = PAGE_SIZE;
680 pd.mincount = 0;
681
682 sb = pnfslay->plh_inode->i_sb;
683 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
684 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
685 if (err)
686 goto err_out;
687
688 p = page_address(page);
689 odi = kzalloc(sizeof(*odi), gfp_flags);
690 if (!odi) {
691 err = -ENOMEM;
692 goto err_out;
693 }
694 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
695 odi->page = page;
696 *deviceaddr = &odi->da;
697 return 0;
698
699err_out:
700 __free_page(page);
701 return err;
702}
703
704void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
705{
706 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
707 struct objlayout_deviceinfo,
708 da);
709
710 __free_page(odi->page);
711 kfree(odi);
712}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 000000000000..a8244c8e042d
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,187 @@
1/*
2 * Data types and function declarations for interfacing with the
3 * pNFS standard object layout driver.
4 *
5 * Copyright (C) 2007 Panasas Inc. [year of first publication]
6 * All rights reserved.
7 *
8 * Benny Halevy <bhalevy@panasas.com>
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2
13 * See the file COPYING included with this distribution for more details.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 *
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 * 3. Neither the name of the Panasas company nor the names of its
25 * contributors may be used to endorse or promote products derived
26 * from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
29 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
30 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
36 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
37 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 */
40
41#ifndef _OBJLAYOUT_H
42#define _OBJLAYOUT_H
43
44#include <linux/nfs_fs.h>
45#include <linux/pnfs_osd_xdr.h>
46#include "../pnfs.h"
47
48/*
49 * per-inode layout
50 */
51struct objlayout {
52 struct pnfs_layout_hdr pnfs_layout;
53
54 /* for layout_commit */
55 enum osd_delta_space_valid_enum {
56 OBJ_DSU_INIT = 0,
57 OBJ_DSU_VALID,
58 OBJ_DSU_INVALID,
59 } delta_space_valid;
60 s64 delta_space_used; /* consumed by write ops */
61
62 /* for layout_return */
63 spinlock_t lock;
64 struct list_head err_list;
65};
66
67static inline struct objlayout *
68OBJLAYOUT(struct pnfs_layout_hdr *lo)
69{
70 return container_of(lo, struct objlayout, pnfs_layout);
71}
72
73/*
74 * per-I/O operation state
75 * embedded in objects provider io_state data structure
76 */
77struct objlayout_io_state {
78 struct pnfs_layout_segment *lseg;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86
87 void *rpcdata;
88 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */
91
92 /* Error reporting (layout_return) */
93 struct list_head err_list;
94 unsigned num_comps;
95 /* Pointer to array of error descriptors of size num_comps.
96 * It should contain as many entries as devices in the osd_layout
97 * that participate in the I/O. It is up to the io_engine to allocate
98 * needed space and set num_comps.
99 */
100 struct pnfs_osd_ioerr *ioerrs;
101};
102
103/*
104 * Raid engine I/O API
105 */
106extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
107 struct pnfs_layout_hdr *pnfslay,
108 struct pnfs_layout_range *range,
109 struct xdr_stream *xdr,
110 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112
113extern int objio_alloc_io_state(
114 struct pnfs_layout_segment *lseg,
115 struct objlayout_io_state **outp,
116 gfp_t gfp_flags);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
121 bool stable);
122
123/*
124 * callback API
125 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state,
127 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write);
129
130static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
132{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported.
138 */
139 spin_lock(&objlay->lock);
140 if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
141 objlay->delta_space_valid = OBJ_DSU_VALID;
142 objlay->delta_space_used += space_used;
143 }
144 spin_unlock(&objlay->lock);
145}
146
147extern void objlayout_read_done(struct objlayout_io_state *state,
148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state,
150 ssize_t status, bool sync);
151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/*
158 * exported generic objects function vectors
159 */
160
161extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
162extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
163
164extern struct pnfs_layout_segment *objlayout_alloc_lseg(
165 struct pnfs_layout_hdr *,
166 struct nfs4_layoutget_res *,
167 gfp_t gfp_flags);
168extern void objlayout_free_lseg(struct pnfs_layout_segment *);
169
170extern enum pnfs_try_status objlayout_read_pagelist(
171 struct nfs_read_data *);
172
173extern enum pnfs_try_status objlayout_write_pagelist(
174 struct nfs_write_data *,
175 int how);
176
177extern void objlayout_encode_layoutcommit(
178 struct pnfs_layout_hdr *,
179 struct xdr_stream *,
180 const struct nfs4_layoutcommit_args *);
181
182extern void objlayout_encode_layoutreturn(
183 struct pnfs_layout_hdr *,
184 struct xdr_stream *,
185 const struct nfs4_layoutreturn_args *);
186
187#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 000000000000..16fc758e9123
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
1/*
2 * Object-Based pNFS Layout XDR layer
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/pnfs_osd_xdr.h>
41
42#define NFSDBG_FACILITY NFSDBG_PNFS_LD
43
44/*
45 * The following implementation is based on RFC5664
46 */
47
48/*
49 * struct pnfs_osd_objid {
50 * struct nfs4_deviceid oid_device_id;
51 * u64 oid_partition_id;
52 * u64 oid_object_id;
53 * }; // xdr size 32 bytes
54 */
55static __be32 *
56_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
57{
58 p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
59 sizeof(objid->oid_device_id.data));
60
61 p = xdr_decode_hyper(p, &objid->oid_partition_id);
62 p = xdr_decode_hyper(p, &objid->oid_object_id);
63 return p;
64}
65/*
66 * struct pnfs_osd_opaque_cred {
67 * u32 cred_len;
68 * void *cred;
69 * }; // xdr size [variable]
70 * The return pointers are from the xdr buffer
71 */
72static int
73_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
74 struct xdr_stream *xdr)
75{
76 __be32 *p = xdr_inline_decode(xdr, 1);
77
78 if (!p)
79 return -EINVAL;
80
81 opaque_cred->cred_len = be32_to_cpu(*p++);
82
83 p = xdr_inline_decode(xdr, opaque_cred->cred_len);
84 if (!p)
85 return -EINVAL;
86
87 opaque_cred->cred = p;
88 return 0;
89}
90
91/*
92 * struct pnfs_osd_object_cred {
93 * struct pnfs_osd_objid oc_object_id;
94 * u32 oc_osd_version;
95 * u32 oc_cap_key_sec;
96 * struct pnfs_osd_opaque_cred oc_cap_key
97 * struct pnfs_osd_opaque_cred oc_cap;
98 * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
99 */
100static int
101_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
102 struct xdr_stream *xdr)
103{
104 __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
105 int ret;
106
107 if (!p)
108 return -EIO;
109
110 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
111 comp->oc_osd_version = be32_to_cpup(p++);
112 comp->oc_cap_key_sec = be32_to_cpup(p);
113
114 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
115 if (unlikely(ret))
116 return ret;
117
118 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
119 return ret;
120}
121
122/*
123 * struct pnfs_osd_data_map {
124 * u32 odm_num_comps;
125 * u64 odm_stripe_unit;
126 * u32 odm_group_width;
127 * u32 odm_group_depth;
128 * u32 odm_mirror_cnt;
129 * u32 odm_raid_algorithm;
130 * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
131 */
/* XDR size in bytes of a pnfs_osd_data_map: five 32-bit fields plus one 64-bit field. */
static inline int
_osd_data_map_xdr_sz(void)
{
	enum {
		ODM_U32_FIELDS = 5,	/* num_comps, group_width, group_depth, mirror_cnt, raid_algorithm */
		ODM_U64_FIELDS = 1,	/* stripe_unit */
	};

	return ODM_U32_FIELDS * 4 + ODM_U64_FIELDS * 8;
}
137
138static __be32 *
139_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
140{
141 data_map->odm_num_comps = be32_to_cpup(p++);
142 p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
143 data_map->odm_group_width = be32_to_cpup(p++);
144 data_map->odm_group_depth = be32_to_cpup(p++);
145 data_map->odm_mirror_cnt = be32_to_cpup(p++);
146 data_map->odm_raid_algorithm = be32_to_cpup(p++);
147 dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
148 "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
149 __func__,
150 data_map->odm_num_comps,
151 (unsigned long long)data_map->odm_stripe_unit,
152 data_map->odm_group_width,
153 data_map->odm_group_depth,
154 data_map->odm_mirror_cnt,
155 data_map->odm_raid_algorithm);
156 return p;
157}
158
159int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
160 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
161{
162 __be32 *p;
163
164 memset(iter, 0, sizeof(*iter));
165
166 p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
167 if (unlikely(!p))
168 return -EINVAL;
169
170 p = _osd_xdr_decode_data_map(p, &layout->olo_map);
171 layout->olo_comps_index = be32_to_cpup(p++);
172 layout->olo_num_comps = be32_to_cpup(p++);
173 iter->total_comps = layout->olo_num_comps;
174 return 0;
175}
176
177bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
178 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
179 int *err)
180{
181 BUG_ON(iter->decoded_comps > iter->total_comps);
182 if (iter->decoded_comps == iter->total_comps)
183 return false;
184
185 *err = _osd_xdr_decode_object_cred(comp, xdr);
186 if (unlikely(*err)) {
187 dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
188 "total_comps=%d\n", __func__, *err,
189 iter->decoded_comps, iter->total_comps);
190 return false; /* stop the loop */
191 }
192 dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
193 "key_len=%u cap_len=%u\n",
194 __func__,
195 _DEVID_LO(&comp->oc_object_id.oid_device_id),
196 _DEVID_HI(&comp->oc_object_id.oid_device_id),
197 comp->oc_object_id.oid_partition_id,
198 comp->oc_object_id.oid_object_id,
199 comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
200
201 iter->decoded_comps++;
202 return true;
203}
204
205/*
206 * Get Device Information Decoding
207 *
208 * Note: since Device Information is currently done synchronously, all
209 * variable strings fields are left inside the rpc buffer and are only
210 * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
211 * should not be freed while the returned information is in use.
212 */
213/*
214 *struct nfs4_string {
215 * unsigned int len;
216 * char *data;
217 *}; // size [variable]
218 * NOTE: Returned string points to inside the XDR buffer
219 */
220static __be32 *
221__read_u8_opaque(__be32 *p, struct nfs4_string *str)
222{
223 str->len = be32_to_cpup(p++);
224 str->data = (char *)p;
225
226 p += XDR_QUADLEN(str->len);
227 return p;
228}
229
230/*
231 * struct pnfs_osd_targetid {
232 * u32 oti_type;
233 * struct nfs4_string oti_scsi_device_id;
234 * };// size 4 + [variable]
235 */
236static __be32 *
237__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
238{
239 u32 oti_type;
240
241 oti_type = be32_to_cpup(p++);
242 targetid->oti_type = oti_type;
243
244 switch (oti_type) {
245 case OBJ_TARGET_SCSI_NAME:
246 case OBJ_TARGET_SCSI_DEVICE_ID:
247 p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
248 }
249
250 return p;
251}
252
253/*
254 * struct pnfs_osd_net_addr {
255 * struct nfs4_string r_netid;
256 * struct nfs4_string r_addr;
257 * };
258 */
259static __be32 *
260__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
261{
262 p = __read_u8_opaque(p, &netaddr->r_netid);
263 p = __read_u8_opaque(p, &netaddr->r_addr);
264
265 return p;
266}
267
268/*
269 * struct pnfs_osd_targetaddr {
270 * u32 ota_available;
271 * struct pnfs_osd_net_addr ota_netaddr;
272 * };
273 */
274static __be32 *
275__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
276{
277 u32 ota_available;
278
279 ota_available = be32_to_cpup(p++);
280 targetaddr->ota_available = ota_available;
281
282 if (ota_available)
283 p = __read_net_addr(p, &targetaddr->ota_netaddr);
284
285
286 return p;
287}
288
289/*
290 * struct pnfs_osd_deviceaddr {
291 * struct pnfs_osd_targetid oda_targetid;
292 * struct pnfs_osd_targetaddr oda_targetaddr;
293 * u8 oda_lun[8];
294 * struct nfs4_string oda_systemid;
295 * struct pnfs_osd_object_cred oda_root_obj_cred;
296 * struct nfs4_string oda_osdname;
297 * };
298 */
299
300/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
301 * not have an xdr_stream
302 */
303static __be32 *
304__read_opaque_cred(__be32 *p,
305 struct pnfs_osd_opaque_cred *opaque_cred)
306{
307 opaque_cred->cred_len = be32_to_cpu(*p++);
308 opaque_cred->cred = p;
309 return p + XDR_QUADLEN(opaque_cred->cred_len);
310}
311
312static __be32 *
313__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
314{
315 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
316 comp->oc_osd_version = be32_to_cpup(p++);
317 comp->oc_cap_key_sec = be32_to_cpup(p++);
318
319 p = __read_opaque_cred(p, &comp->oc_cap_key);
320 p = __read_opaque_cred(p, &comp->oc_cap);
321 return p;
322}
323
324void pnfs_osd_xdr_decode_deviceaddr(
325 struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
326{
327 p = __read_targetid(p, &deviceaddr->oda_targetid);
328
329 p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
330
331 p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
332 sizeof(deviceaddr->oda_lun));
333
334 p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
335
336 p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
337
338 p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
339
340 /* libosd likes this terminated in dbg. It's last, so no problems */
341 deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
342}
343
344/*
345 * struct pnfs_osd_layoutupdate {
346 * u32 dsu_valid;
347 * s64 dsu_delta;
348 * u32 olu_ioerr_flag;
349 * }; xdr size 4 + 8 + 4
350 */
351int
352pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
353 struct pnfs_osd_layoutupdate *lou)
354{
355 __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4);
356
357 if (!p)
358 return -E2BIG;
359
360 *p++ = cpu_to_be32(lou->dsu_valid);
361 if (lou->dsu_valid)
362 p = xdr_encode_hyper(p, lou->dsu_delta);
363 *p++ = cpu_to_be32(lou->olu_ioerr_flag);
364 return 0;
365}
366
367/*
368 * struct pnfs_osd_objid {
369 * struct nfs4_deviceid oid_device_id;
370 * u64 oid_partition_id;
371 * u64 oid_object_id;
372 * }; // xdr size 32 bytes
373 */
374static inline __be32 *
375pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
376{
377 p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
378 sizeof(object_id->oid_device_id.data));
379 p = xdr_encode_hyper(p, object_id->oid_partition_id);
380 p = xdr_encode_hyper(p, object_id->oid_object_id);
381
382 return p;
383}
384
385/*
386 * struct pnfs_osd_ioerr {
387 * struct pnfs_osd_objid oer_component;
388 * u64 oer_comp_offset;
389 * u64 oer_comp_length;
390 * u32 oer_iswrite;
391 * u32 oer_errno;
392 * }; // xdr size 32 + 24 bytes
393 */
394void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
395{
396 p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
397 p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
398 p = xdr_encode_hyper(p, ioerr->oer_comp_length);
399 *p++ = cpu_to_be32(ioerr->oer_iswrite);
400 *p = cpu_to_be32(ioerr->oer_errno);
401}
402
403__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
404{
405 __be32 *p;
406
407 p = xdr_reserve_space(xdr, 32 + 24);
408 if (unlikely(!p))
409 dprintk("%s: out of xdr space\n", __func__);
410
411 return p;
412}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c80add6e2213..7913961aff22 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
204 TASK_UNINTERRUPTIBLE); 204 TASK_UNINTERRUPTIBLE);
205} 205}
206 206
207static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
208{
209 /*
210 * FIXME: ideally we should be able to coalesce all requests
211 * that are not block boundary aligned, but currently this
212 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
213 * since nfs_flush_multi and nfs_pagein_multi assume you
214 * can have only one struct nfs_page.
215 */
216 if (desc->pg_bsize < PAGE_SIZE)
217 return 0;
218
219 return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
220}
221
207/** 222/**
208 * nfs_pageio_init - initialise a page io descriptor 223 * nfs_pageio_init - initialise a page io descriptor
209 * @desc: pointer to descriptor 224 * @desc: pointer to descriptor
@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
229 desc->pg_ioflags = io_flags; 244 desc->pg_ioflags = io_flags;
230 desc->pg_error = 0; 245 desc->pg_error = 0;
231 desc->pg_lseg = NULL; 246 desc->pg_lseg = NULL;
247 desc->pg_test = nfs_generic_pg_test;
248 pnfs_pageio_init(desc, inode);
232} 249}
233 250
234/** 251/**
@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
242 * 259 *
243 * Return 'true' if this is the case, else return 'false'. 260 * Return 'true' if this is the case, else return 'false'.
244 */ 261 */
245static int nfs_can_coalesce_requests(struct nfs_page *prev, 262static bool nfs_can_coalesce_requests(struct nfs_page *prev,
246 struct nfs_page *req, 263 struct nfs_page *req,
247 struct nfs_pageio_descriptor *pgio) 264 struct nfs_pageio_descriptor *pgio)
248{ 265{
249 if (req->wb_context->cred != prev->wb_context->cred) 266 if (req->wb_context->cred != prev->wb_context->cred)
250 return 0; 267 return false;
251 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) 268 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
252 return 0; 269 return false;
253 if (req->wb_context->state != prev->wb_context->state) 270 if (req->wb_context->state != prev->wb_context->state)
254 return 0; 271 return false;
255 if (req->wb_index != (prev->wb_index + 1)) 272 if (req->wb_index != (prev->wb_index + 1))
256 return 0; 273 return false;
257 if (req->wb_pgbase != 0) 274 if (req->wb_pgbase != 0)
258 return 0; 275 return false;
259 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 276 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
260 return 0; 277 return false;
261 /* 278 return pgio->pg_test(pgio, prev, req);
262 * Non-whole file layouts need to check that req is inside of
263 * pgio->pg_lseg.
264 */
265 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
266 return 0;
267 return 1;
268} 279}
269 280
270/** 281/**
@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
278static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 289static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
279 struct nfs_page *req) 290 struct nfs_page *req)
280{ 291{
281 size_t newlen = req->wb_bytes;
282
283 if (desc->pg_count != 0) { 292 if (desc->pg_count != 0) {
284 struct nfs_page *prev; 293 struct nfs_page *prev;
285 294
286 /*
287 * FIXME: ideally we should be able to coalesce all requests
288 * that are not block boundary aligned, but currently this
289 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
290 * since nfs_flush_multi and nfs_pagein_multi assume you
291 * can have only one struct nfs_page.
292 */
293 if (desc->pg_bsize < PAGE_SIZE)
294 return 0;
295 newlen += desc->pg_count;
296 if (newlen > desc->pg_bsize)
297 return 0;
298 prev = nfs_list_entry(desc->pg_list.prev); 295 prev = nfs_list_entry(desc->pg_list.prev);
299 if (!nfs_can_coalesce_requests(prev, req, desc)) 296 if (!nfs_can_coalesce_requests(prev, req, desc))
300 return 0; 297 return 0;
301 } else 298 } else {
302 desc->pg_base = req->wb_pgbase; 299 desc->pg_base = req->wb_pgbase;
300 }
303 nfs_list_remove_request(req); 301 nfs_list_remove_request(req);
304 nfs_list_add_request(req, &desc->pg_list); 302 nfs_list_add_request(req, &desc->pg_list);
305 desc->pg_count = newlen; 303 desc->pg_count += req->wb_bytes;
306 return 1; 304 return 1;
307} 305}
308 306
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f57f5281a520..8c1309d852a6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
177 atomic_inc(&lo->plh_refcount); 177 atomic_inc(&lo->plh_refcount);
178} 178}
179 179
180static struct pnfs_layout_hdr *
181pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
182{
183 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
184 return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
185 kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
186}
187
188static void
189pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
190{
191 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
192 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
193}
194
180static void 195static void
181destroy_layout_hdr(struct pnfs_layout_hdr *lo) 196destroy_layout_hdr(struct pnfs_layout_hdr *lo)
182{ 197{
183 dprintk("%s: freeing layout cache %p\n", __func__, lo); 198 dprintk("%s: freeing layout cache %p\n", __func__, lo);
184 BUG_ON(!list_empty(&lo->plh_layouts)); 199 BUG_ON(!list_empty(&lo->plh_layouts));
185 NFS_I(lo->plh_inode)->layout = NULL; 200 NFS_I(lo->plh_inode)->layout = NULL;
186 kfree(lo); 201 pnfs_free_layout_hdr(lo);
187} 202}
188 203
189static void 204static void
@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
228{ 243{
229 struct inode *inode = lseg->pls_layout->plh_inode; 244 struct inode *inode = lseg->pls_layout->plh_inode;
230 245
231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 246 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
232 list_del_init(&lseg->pls_list); 247 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) { 248 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); 249 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
261} 276}
262EXPORT_SYMBOL_GPL(put_lseg); 277EXPORT_SYMBOL_GPL(put_lseg);
263 278
279static inline u64
280end_offset(u64 start, u64 len)
281{
282 u64 end;
283
284 end = start + len;
285 return end >= start ? end : NFS4_MAX_UINT64;
286}
287
288/* last octet in a range */
289static inline u64
290last_byte_offset(u64 start, u64 len)
291{
292 u64 end;
293
294 BUG_ON(!len);
295 end = start + len;
296 return end > start ? end - 1 : NFS4_MAX_UINT64;
297}
298
299/*
300 * is l2 fully contained in l1?
301 * start1 end1
302 * [----------------------------------)
303 * start2 end2
304 * [----------------)
305 */
306static inline int
307lo_seg_contained(struct pnfs_layout_range *l1,
308 struct pnfs_layout_range *l2)
309{
310 u64 start1 = l1->offset;
311 u64 end1 = end_offset(start1, l1->length);
312 u64 start2 = l2->offset;
313 u64 end2 = end_offset(start2, l2->length);
314
315 return (start1 <= start2) && (end1 >= end2);
316}
317
318/*
319 * is l1 and l2 intersecting?
320 * start1 end1
321 * [----------------------------------)
322 * start2 end2
323 * [----------------)
324 */
325static inline int
326lo_seg_intersecting(struct pnfs_layout_range *l1,
327 struct pnfs_layout_range *l2)
328{
329 u64 start1 = l1->offset;
330 u64 end1 = end_offset(start1, l1->length);
331 u64 start2 = l2->offset;
332 u64 end2 = end_offset(start2, l2->length);
333
334 return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
335 (end2 == NFS4_MAX_UINT64 || end2 > start1);
336}
337
264static bool 338static bool
265should_free_lseg(u32 lseg_iomode, u32 recall_iomode) 339should_free_lseg(struct pnfs_layout_range *lseg_range,
340 struct pnfs_layout_range *recall_range)
266{ 341{
267 return (recall_iomode == IOMODE_ANY || 342 return (recall_range->iomode == IOMODE_ANY ||
268 lseg_iomode == recall_iomode); 343 lseg_range->iomode == recall_range->iomode) &&
344 lo_seg_intersecting(lseg_range, recall_range);
269} 345}
270 346
271/* Returns 1 if lseg is removed from list, 0 otherwise */ 347/* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
296int 372int
297mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 373mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
298 struct list_head *tmp_list, 374 struct list_head *tmp_list,
299 u32 iomode) 375 struct pnfs_layout_range *recall_range)
300{ 376{
301 struct pnfs_layout_segment *lseg, *next; 377 struct pnfs_layout_segment *lseg, *next;
302 int invalid = 0, removed = 0; 378 int invalid = 0, removed = 0;
@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
309 return 0; 385 return 0;
310 } 386 }
311 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 387 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
312 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 388 if (!recall_range ||
389 should_free_lseg(&lseg->pls_range, recall_range)) {
313 dprintk("%s: freeing lseg %p iomode %d " 390 dprintk("%s: freeing lseg %p iomode %d "
314 "offset %llu length %llu\n", __func__, 391 "offset %llu length %llu\n", __func__,
315 lseg, lseg->pls_range.iomode, lseg->pls_range.offset, 392 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
358 lo = nfsi->layout; 435 lo = nfsi->layout;
359 if (lo) { 436 if (lo) {
360 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 437 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
361 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 438 mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
362 } 439 }
363 spin_unlock(&nfsi->vfs_inode.i_lock); 440 spin_unlock(&nfsi->vfs_inode.i_lock);
364 pnfs_free_lseg_list(&tmp_list); 441 pnfs_free_lseg_list(&tmp_list);
@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
467static struct pnfs_layout_segment * 544static struct pnfs_layout_segment *
468send_layoutget(struct pnfs_layout_hdr *lo, 545send_layoutget(struct pnfs_layout_hdr *lo,
469 struct nfs_open_context *ctx, 546 struct nfs_open_context *ctx,
470 u32 iomode, 547 struct pnfs_layout_range *range,
471 gfp_t gfp_flags) 548 gfp_t gfp_flags)
472{ 549{
473 struct inode *ino = lo->plh_inode; 550 struct inode *ino = lo->plh_inode;
@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
499 goto out_err_free; 576 goto out_err_free;
500 } 577 }
501 578
502 lgp->args.minlength = NFS4_MAX_UINT64; 579 lgp->args.minlength = PAGE_CACHE_SIZE;
580 if (lgp->args.minlength > range->length)
581 lgp->args.minlength = range->length;
503 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 582 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
504 lgp->args.range.iomode = iomode; 583 lgp->args.range = *range;
505 lgp->args.range.offset = 0;
506 lgp->args.range.length = NFS4_MAX_UINT64;
507 lgp->args.type = server->pnfs_curr_ld->id; 584 lgp->args.type = server->pnfs_curr_ld->id;
508 lgp->args.inode = ino; 585 lgp->args.inode = ino;
509 lgp->args.ctx = get_nfs_open_context(ctx); 586 lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
518 nfs4_proc_layoutget(lgp); 595 nfs4_proc_layoutget(lgp);
519 if (!lseg) { 596 if (!lseg) {
520 /* remember that LAYOUTGET failed and suspend trying */ 597 /* remember that LAYOUTGET failed and suspend trying */
521 set_bit(lo_fail_bit(iomode), &lo->plh_flags); 598 set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
522 } 599 }
523 600
524 /* free xdr pages */ 601 /* free xdr pages */
@@ -542,6 +619,51 @@ out_err_free:
542 return NULL; 619 return NULL;
543} 620}
544 621
622/* Initiates a LAYOUTRETURN(FILE) */
623int
624_pnfs_return_layout(struct inode *ino)
625{
626 struct pnfs_layout_hdr *lo = NULL;
627 struct nfs_inode *nfsi = NFS_I(ino);
628 LIST_HEAD(tmp_list);
629 struct nfs4_layoutreturn *lrp;
630 nfs4_stateid stateid;
631 int status = 0;
632
633 dprintk("--> %s\n", __func__);
634
635 spin_lock(&ino->i_lock);
636 lo = nfsi->layout;
637 if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
638 spin_unlock(&ino->i_lock);
639 dprintk("%s: no layout segments to return\n", __func__);
640 goto out;
641 }
642 stateid = nfsi->layout->plh_stateid;
643 /* Reference matched in nfs4_layoutreturn_release */
644 get_layout_hdr(lo);
645 spin_unlock(&ino->i_lock);
646 pnfs_free_lseg_list(&tmp_list);
647
648 WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
649
650 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
651 if (unlikely(lrp == NULL)) {
652 status = -ENOMEM;
653 goto out;
654 }
655
656 lrp->args.stateid = stateid;
657 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
658 lrp->args.inode = ino;
659 lrp->clp = NFS_SERVER(ino)->nfs_client;
660
661 status = nfs4_proc_layoutreturn(lrp);
662out:
663 dprintk("<-- %s status: %d\n", __func__, status);
664 return status;
665}
666
545bool pnfs_roc(struct inode *ino) 667bool pnfs_roc(struct inode *ino)
546{ 668{
547 struct pnfs_layout_hdr *lo; 669 struct pnfs_layout_hdr *lo;
@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
625 * are seen first. 747 * are seen first.
626 */ 748 */
627static s64 749static s64
628cmp_layout(u32 iomode1, u32 iomode2) 750cmp_layout(struct pnfs_layout_range *l1,
751 struct pnfs_layout_range *l2)
629{ 752{
753 s64 d;
754
755 /* high offset > low offset */
756 d = l1->offset - l2->offset;
757 if (d)
758 return d;
759
760 /* short length > long length */
761 d = l2->length - l1->length;
762 if (d)
763 return d;
764
630 /* read > read/write */ 765 /* read > read/write */
631 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); 766 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
632} 767}
633 768
634static void 769static void
@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
636 struct pnfs_layout_segment *lseg) 771 struct pnfs_layout_segment *lseg)
637{ 772{
638 struct pnfs_layout_segment *lp; 773 struct pnfs_layout_segment *lp;
639 int found = 0;
640 774
641 dprintk("%s:Begin\n", __func__); 775 dprintk("%s:Begin\n", __func__);
642 776
643 assert_spin_locked(&lo->plh_inode->i_lock); 777 assert_spin_locked(&lo->plh_inode->i_lock);
644 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 778 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
645 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) 779 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
646 continue; 780 continue;
647 list_add_tail(&lseg->pls_list, &lp->pls_list); 781 list_add_tail(&lseg->pls_list, &lp->pls_list);
648 dprintk("%s: inserted lseg %p " 782 dprintk("%s: inserted lseg %p "
@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
652 lseg->pls_range.offset, lseg->pls_range.length, 786 lseg->pls_range.offset, lseg->pls_range.length,
653 lp, lp->pls_range.iomode, lp->pls_range.offset, 787 lp, lp->pls_range.iomode, lp->pls_range.offset,
654 lp->pls_range.length); 788 lp->pls_range.length);
655 found = 1; 789 goto out;
656 break;
657 }
658 if (!found) {
659 list_add_tail(&lseg->pls_list, &lo->plh_segs);
660 dprintk("%s: inserted lseg %p "
661 "iomode %d offset %llu length %llu at tail\n",
662 __func__, lseg, lseg->pls_range.iomode,
663 lseg->pls_range.offset, lseg->pls_range.length);
664 } 790 }
791 list_add_tail(&lseg->pls_list, &lo->plh_segs);
792 dprintk("%s: inserted lseg %p "
793 "iomode %d offset %llu length %llu at tail\n",
794 __func__, lseg, lseg->pls_range.iomode,
795 lseg->pls_range.offset, lseg->pls_range.length);
796out:
665 get_layout_hdr(lo); 797 get_layout_hdr(lo);
666 798
667 dprintk("%s:Return\n", __func__); 799 dprintk("%s:Return\n", __func__);
@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
672{ 804{
673 struct pnfs_layout_hdr *lo; 805 struct pnfs_layout_hdr *lo;
674 806
675 lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); 807 lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
676 if (!lo) 808 if (!lo)
677 return NULL; 809 return NULL;
678 atomic_set(&lo->plh_refcount, 1); 810 atomic_set(&lo->plh_refcount, 1);
@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
705 if (likely(nfsi->layout == NULL)) /* Won the race? */ 837 if (likely(nfsi->layout == NULL)) /* Won the race? */
706 nfsi->layout = new; 838 nfsi->layout = new;
707 else 839 else
708 kfree(new); 840 pnfs_free_layout_hdr(new);
709 return nfsi->layout; 841 return nfsi->layout;
710} 842}
711 843
@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
721 * READ RW true 853 * READ RW true
722 */ 854 */
723static int 855static int
724is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 856is_matching_lseg(struct pnfs_layout_range *ls_range,
857 struct pnfs_layout_range *range)
725{ 858{
726 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); 859 struct pnfs_layout_range range1;
860
861 if ((range->iomode == IOMODE_RW &&
862 ls_range->iomode != IOMODE_RW) ||
863 !lo_seg_intersecting(ls_range, range))
864 return 0;
865
866 /* range1 covers only the first byte in the range */
867 range1 = *range;
868 range1.length = 1;
869 return lo_seg_contained(ls_range, &range1);
727} 870}
728 871
729/* 872/*
730 * lookup range in layout 873 * lookup range in layout
731 */ 874 */
732static struct pnfs_layout_segment * 875static struct pnfs_layout_segment *
733pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) 876pnfs_find_lseg(struct pnfs_layout_hdr *lo,
877 struct pnfs_layout_range *range)
734{ 878{
735 struct pnfs_layout_segment *lseg, *ret = NULL; 879 struct pnfs_layout_segment *lseg, *ret = NULL;
736 880
@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
739 assert_spin_locked(&lo->plh_inode->i_lock); 883 assert_spin_locked(&lo->plh_inode->i_lock);
740 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 884 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
741 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 885 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
742 is_matching_lseg(lseg, iomode)) { 886 is_matching_lseg(&lseg->pls_range, range)) {
743 ret = get_lseg(lseg); 887 ret = get_lseg(lseg);
744 break; 888 break;
745 } 889 }
746 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 890 if (cmp_layout(range, &lseg->pls_range) > 0)
747 break; 891 break;
748 } 892 }
749 893
@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
759struct pnfs_layout_segment * 903struct pnfs_layout_segment *
760pnfs_update_layout(struct inode *ino, 904pnfs_update_layout(struct inode *ino,
761 struct nfs_open_context *ctx, 905 struct nfs_open_context *ctx,
906 loff_t pos,
907 u64 count,
762 enum pnfs_iomode iomode, 908 enum pnfs_iomode iomode,
763 gfp_t gfp_flags) 909 gfp_t gfp_flags)
764{ 910{
911 struct pnfs_layout_range arg = {
912 .iomode = iomode,
913 .offset = pos,
914 .length = count,
915 };
916 unsigned pg_offset;
765 struct nfs_inode *nfsi = NFS_I(ino); 917 struct nfs_inode *nfsi = NFS_I(ino);
766 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 918 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
767 struct pnfs_layout_hdr *lo; 919 struct pnfs_layout_hdr *lo;
@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
789 goto out_unlock; 941 goto out_unlock;
790 942
791 /* Check to see if the layout for the given range already exists */ 943 /* Check to see if the layout for the given range already exists */
792 lseg = pnfs_find_lseg(lo, iomode); 944 lseg = pnfs_find_lseg(lo, &arg);
793 if (lseg) 945 if (lseg)
794 goto out_unlock; 946 goto out_unlock;
795 947
@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
811 spin_unlock(&clp->cl_lock); 963 spin_unlock(&clp->cl_lock);
812 } 964 }
813 965
814 lseg = send_layoutget(lo, ctx, iomode, gfp_flags); 966 pg_offset = arg.offset & ~PAGE_CACHE_MASK;
967 if (pg_offset) {
968 arg.offset -= pg_offset;
969 arg.length += pg_offset;
970 }
971 arg.length = PAGE_CACHE_ALIGN(arg.length);
972
973 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
815 if (!lseg && first) { 974 if (!lseg && first) {
816 spin_lock(&clp->cl_lock); 975 spin_lock(&clp->cl_lock);
817 list_del_init(&lo->plh_layouts); 976 list_del_init(&lo->plh_layouts);
@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
838 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 997 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
839 int status = 0; 998 int status = 0;
840 999
841 /* Verify we got what we asked for.
842 * Note that because the xdr parsing only accepts a single
843 * element array, this can fail even if the server is behaving
844 * correctly.
845 */
846 if (lgp->args.range.iomode > res->range.iomode ||
847 res->range.offset != 0 ||
848 res->range.length != NFS4_MAX_UINT64) {
849 status = -EINVAL;
850 goto out;
851 }
852 /* Inject layout blob into I/O device driver */ 1000 /* Inject layout blob into I/O device driver */
853 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 1001 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
854 if (!lseg || IS_ERR(lseg)) { 1002 if (!lseg || IS_ERR(lseg)) {
@@ -895,51 +1043,64 @@ out_forget_reply:
895 goto out; 1043 goto out;
896} 1044}
897 1045
898static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, 1046bool
899 struct nfs_page *prev, 1047pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
900 struct nfs_page *req) 1048 struct nfs_page *req)
901{ 1049{
1050 enum pnfs_iomode access_type;
1051 gfp_t gfp_flags;
1052
1053 /* We assume that pg_ioflags == 0 iff we're reading a page */
1054 if (pgio->pg_ioflags == 0) {
1055 access_type = IOMODE_READ;
1056 gfp_flags = GFP_KERNEL;
1057 } else {
1058 access_type = IOMODE_RW;
1059 gfp_flags = GFP_NOFS;
1060 }
1061
902 if (pgio->pg_count == prev->wb_bytes) { 1062 if (pgio->pg_count == prev->wb_bytes) {
903 /* This is first coelesce call for a series of nfs_pages */ 1063 /* This is first coelesce call for a series of nfs_pages */
904 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1064 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
905 prev->wb_context, 1065 prev->wb_context,
906 IOMODE_READ, 1066 req_offset(req),
907 GFP_KERNEL); 1067 pgio->pg_count,
1068 access_type,
1069 gfp_flags);
1070 return true;
908 } 1071 }
909 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
910}
911 1072
912void 1073 if (pgio->pg_lseg &&
913pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) 1074 req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
914{ 1075 pgio->pg_lseg->pls_range.length))
915 struct pnfs_layoutdriver_type *ld; 1076 return false;
916 1077
917 ld = NFS_SERVER(inode)->pnfs_curr_ld; 1078 return true;
918 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
919} 1079}
1080EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
920 1081
921static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, 1082/*
922 struct nfs_page *prev, 1083 * Called by non rpc-based layout drivers
923 struct nfs_page *req) 1084 */
1085int
1086pnfs_ld_write_done(struct nfs_write_data *data)
924{ 1087{
925 if (pgio->pg_count == prev->wb_bytes) { 1088 int status;
926 /* This is first coelesce call for a series of nfs_pages */
927 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
928 prev->wb_context,
929 IOMODE_RW,
930 GFP_NOFS);
931 }
932 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
933}
934 1089
935void 1090 if (!data->pnfs_error) {
936pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) 1091 pnfs_set_layoutcommit(data);
937{ 1092 data->mds_ops->rpc_call_done(&data->task, data);
938 struct pnfs_layoutdriver_type *ld; 1093 data->mds_ops->rpc_release(data);
1094 return 0;
1095 }
939 1096
940 ld = NFS_SERVER(inode)->pnfs_curr_ld; 1097 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
941 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; 1098 data->pnfs_error);
1099 status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
1100 data->mds_ops, NFS_FILE_SYNC);
1101 return status ? : -EAGAIN;
942} 1102}
1103EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
943 1104
944enum pnfs_try_status 1105enum pnfs_try_status
945pnfs_try_to_write_data(struct nfs_write_data *wdata, 1106pnfs_try_to_write_data(struct nfs_write_data *wdata,
@@ -966,6 +1127,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
966} 1127}
967 1128
968/* 1129/*
1130 * Called by non rpc-based layout drivers
1131 */
1132int
1133pnfs_ld_read_done(struct nfs_read_data *data)
1134{
1135 int status;
1136
1137 if (!data->pnfs_error) {
1138 __nfs4_read_done_cb(data);
1139 data->mds_ops->rpc_call_done(&data->task, data);
1140 data->mds_ops->rpc_release(data);
1141 return 0;
1142 }
1143
1144 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
1145 data->pnfs_error);
1146 status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
1147 data->mds_ops);
1148 return status ? : -EAGAIN;
1149}
1150EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1151
1152/*
969 * Call the appropriate parallel I/O subsystem read function. 1153 * Call the appropriate parallel I/O subsystem read function.
970 */ 1154 */
971enum pnfs_try_status 1155enum pnfs_try_status
@@ -1009,7 +1193,7 @@ void
1009pnfs_set_layoutcommit(struct nfs_write_data *wdata) 1193pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1010{ 1194{
1011 struct nfs_inode *nfsi = NFS_I(wdata->inode); 1195 struct nfs_inode *nfsi = NFS_I(wdata->inode);
1012 loff_t end_pos = wdata->args.offset + wdata->res.count; 1196 loff_t end_pos = wdata->mds_offset + wdata->res.count;
1013 bool mark_as_dirty = false; 1197 bool mark_as_dirty = false;
1014 1198
1015 spin_lock(&nfsi->vfs_inode.i_lock); 1199 spin_lock(&nfsi->vfs_inode.i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0c015bad9e7a..48d0a8e4d062 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 34#include <linux/nfs_page.h>
34 35
35enum { 36enum {
@@ -64,17 +65,29 @@ enum {
64 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ 65 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
65}; 66};
66 67
68enum layoutdriver_policy_flags {
69 /* Should the pNFS client commit and return the layout upon a setattr */
70 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
71};
72
73struct nfs4_deviceid_node;
74
67/* Per-layout driver specific registration structure */ 75/* Per-layout driver specific registration structure */
68struct pnfs_layoutdriver_type { 76struct pnfs_layoutdriver_type {
69 struct list_head pnfs_tblid; 77 struct list_head pnfs_tblid;
70 const u32 id; 78 const u32 id;
71 const char *name; 79 const char *name;
72 struct module *owner; 80 struct module *owner;
81 unsigned flags;
82
83 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 void (*free_layout_hdr) (struct pnfs_layout_hdr *);
85
73 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 86 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
74 void (*free_lseg) (struct pnfs_layout_segment *lseg); 87 void (*free_lseg) (struct pnfs_layout_segment *lseg);
75 88
76 /* test for nfs page cache coalescing */ 89 /* test for nfs page cache coalescing */
77 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 90 bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
78 91
79 /* Returns true if layoutdriver wants to divert this request to 92 /* Returns true if layoutdriver wants to divert this request to
80 * driver's commit routine. 93 * driver's commit routine.
@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
89 */ 102 */
90 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); 103 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
91 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); 104 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
105
106 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
107
108 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
109 struct xdr_stream *xdr,
110 const struct nfs4_layoutreturn_args *args);
111
112 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
113 struct xdr_stream *xdr,
114 const struct nfs4_layoutcommit_args *args);
92}; 115};
93 116
94struct pnfs_layout_hdr { 117struct pnfs_layout_hdr {
@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
120extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
121 struct pnfs_device *dev); 144 struct pnfs_device *dev);
122extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
123 147
124/* pnfs.c */ 148/* pnfs.c */
125void get_layout_hdr(struct pnfs_layout_hdr *lo); 149void get_layout_hdr(struct pnfs_layout_hdr *lo);
126void put_lseg(struct pnfs_layout_segment *lseg); 150void put_lseg(struct pnfs_layout_segment *lseg);
127struct pnfs_layout_segment * 151struct pnfs_layout_segment *
128pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 152pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
129 enum pnfs_iomode access_type, gfp_t gfp_flags); 153 loff_t pos, u64 count, enum pnfs_iomode access_type,
154 gfp_t gfp_flags);
130void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 155void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
131void unset_pnfs_layoutdriver(struct nfs_server *); 156void unset_pnfs_layoutdriver(struct nfs_server *);
132enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, 157enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
133 const struct rpc_call_ops *, int); 158 const struct rpc_call_ops *, int);
134enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, 159enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
135 const struct rpc_call_ops *); 160 const struct rpc_call_ops *);
136void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); 161bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
137void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
138int pnfs_layout_process(struct nfs4_layoutget *lgp); 162int pnfs_layout_process(struct nfs4_layoutget *lgp);
139void pnfs_free_lseg_list(struct list_head *tmp_list); 163void pnfs_free_lseg_list(struct list_head *tmp_list);
140void pnfs_destroy_layout(struct nfs_inode *); 164void pnfs_destroy_layout(struct nfs_inode *);
@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
148 struct nfs4_state *open_state); 172 struct nfs4_state *open_state);
149int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 173int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
150 struct list_head *tmp_list, 174 struct list_head *tmp_list,
151 u32 iomode); 175 struct pnfs_layout_range *recall_range);
152bool pnfs_roc(struct inode *ino); 176bool pnfs_roc(struct inode *ino);
153void pnfs_roc_release(struct inode *ino); 177void pnfs_roc_release(struct inode *ino);
154void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 178void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
155bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 179bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
156void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 180void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
157int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 181int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
182int _pnfs_return_layout(struct inode *);
183int pnfs_ld_write_done(struct nfs_write_data *);
184int pnfs_ld_read_done(struct nfs_read_data *);
185
186/* pnfs_dev.c */
187struct nfs4_deviceid_node {
188 struct hlist_node node;
189 const struct pnfs_layoutdriver_type *ld;
190 const struct nfs_client *nfs_client;
191 struct nfs4_deviceid deviceid;
192 atomic_t ref;
193};
194
195void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
196struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
197struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
198void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
199void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
200 const struct pnfs_layoutdriver_type *,
201 const struct nfs_client *,
202 const struct nfs4_deviceid *);
203struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
204bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
205void nfs4_deviceid_purge_client(const struct nfs_client *);
158 206
159static inline int lo_fail_bit(u32 iomode) 207static inline int lo_fail_bit(u32 iomode)
160{ 208{
@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
223 put_lseg(req->wb_commit_lseg); 271 put_lseg(req->wb_commit_lseg);
224} 272}
225 273
274/* Should the pNFS client commit and return the layout upon a setattr */
275static inline bool
276pnfs_ld_layoutret_on_setattr(struct inode *inode)
277{
278 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
279 return false;
280 return NFS_SERVER(inode)->pnfs_curr_ld->flags &
281 PNFS_LAYOUTRET_ON_SETATTR;
282}
283
284static inline int pnfs_return_layout(struct inode *ino)
285{
286 struct nfs_inode *nfsi = NFS_I(ino);
287 struct nfs_server *nfss = NFS_SERVER(ino);
288
289 if (pnfs_enabled_sb(nfss) && nfsi->layout)
290 return _pnfs_return_layout(ino);
291
292 return 0;
293}
294
295static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
296 struct inode *inode)
297{
298 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
299
300 if (ld)
301 pgio->pg_test = ld->pg_test;
302}
303
226#else /* CONFIG_NFS_V4_1 */ 304#else /* CONFIG_NFS_V4_1 */
227 305
228static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 306static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
245 323
246static inline struct pnfs_layout_segment * 324static inline struct pnfs_layout_segment *
247pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 325pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
248 enum pnfs_iomode access_type, gfp_t gfp_flags) 326 loff_t pos, u64 count, enum pnfs_iomode access_type,
327 gfp_t gfp_flags)
249{ 328{
250 return NULL; 329 return NULL;
251} 330}
@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
264 return PNFS_NOT_ATTEMPTED; 343 return PNFS_NOT_ATTEMPTED;
265} 344}
266 345
346static inline int pnfs_return_layout(struct inode *ino)
347{
348 return 0;
349}
350
351static inline bool
352pnfs_ld_layoutret_on_setattr(struct inode *inode)
353{
354 return false;
355}
356
267static inline bool 357static inline bool
268pnfs_roc(struct inode *ino) 358pnfs_roc(struct inode *ino)
269{ 359{
@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
294{ 384{
295} 385}
296 386
297static inline void 387static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
298pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) 388 struct inode *inode)
299{
300 pgio->pg_test = NULL;
301}
302
303static inline void
304pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
305{ 389{
306 pgio->pg_test = NULL;
307} 390}
308 391
309static inline void 392static inline void
@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
331{ 414{
332 return 0; 415 return 0;
333} 416}
417
418static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
419{
420}
334#endif /* CONFIG_NFS_V4_1 */ 421#endif /* CONFIG_NFS_V4_1 */
335 422
336#endif /* FS_NFS_PNFS_H */ 423#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 000000000000..c65e133ce9c0
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,270 @@
1/*
2 * Device operations for the pnfs client.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include "pnfs.h"
32
33#define NFSDBG_FACILITY NFSDBG_PNFS
34
35/*
36 * Device ID RCU cache. A device ID is unique per server and layout type.
37 */
38#define NFS4_DEVICE_ID_HASH_BITS 5
39#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
40#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
41
42static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
43static DEFINE_SPINLOCK(nfs4_deviceid_lock);
44
45void
46nfs4_print_deviceid(const struct nfs4_deviceid *id)
47{
48 u32 *p = (u32 *)id;
49
50 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
51 p[0], p[1], p[2], p[3]);
52}
53EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
54
55static inline u32
56nfs4_deviceid_hash(const struct nfs4_deviceid *id)
57{
58 unsigned char *cptr = (unsigned char *)id->data;
59 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
60 u32 x = 0;
61
62 while (nbytes--) {
63 x *= 37;
64 x += *cptr++;
65 }
66 return x & NFS4_DEVICE_ID_HASH_MASK;
67}
68
69static struct nfs4_deviceid_node *
70_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
71 const struct nfs_client *clp, const struct nfs4_deviceid *id,
72 long hash)
73{
74 struct nfs4_deviceid_node *d;
75 struct hlist_node *n;
76
77 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
78 if (d->ld == ld && d->nfs_client == clp &&
79 !memcmp(&d->deviceid, id, sizeof(*id))) {
80 if (atomic_read(&d->ref))
81 return d;
82 else
83 continue;
84 }
85 return NULL;
86}
87
88/*
89 * Lookup a deviceid in cache and get a reference count on it if found
90 *
91 * @clp nfs_client associated with deviceid
92 * @id deviceid to look up
93 */
94struct nfs4_deviceid_node *
95_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
96 const struct nfs_client *clp, const struct nfs4_deviceid *id,
97 long hash)
98{
99 struct nfs4_deviceid_node *d;
100
101 rcu_read_lock();
102 d = _lookup_deviceid(ld, clp, id, hash);
103 if (d && !atomic_inc_not_zero(&d->ref))
104 d = NULL;
105 rcu_read_unlock();
106 return d;
107}
108
/*
 * Exported lookup: hash @id, then find the matching cache entry and take
 * a reference on it.
 *
 * @ret the referenced node, or NULL if there is no usable match.
 */
struct nfs4_deviceid_node *
nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
		       const struct nfs_client *clp,
		       const struct nfs4_deviceid *id)
{
	long bucket = nfs4_deviceid_hash(id);

	return _find_get_deviceid(ld, clp, id, bucket);
}
EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
116
117/*
118 * Unhash and put deviceid
119 *
120 * @clp nfs_client associated with deviceid
121 * @id the deviceid to unhash
122 *
123 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
124 */
125struct nfs4_deviceid_node *
126nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
127 const struct nfs_client *clp, const struct nfs4_deviceid *id)
128{
129 struct nfs4_deviceid_node *d;
130
131 spin_lock(&nfs4_deviceid_lock);
132 rcu_read_lock();
133 d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
134 rcu_read_unlock();
135 if (!d) {
136 spin_unlock(&nfs4_deviceid_lock);
137 return NULL;
138 }
139 hlist_del_init_rcu(&d->node);
140 spin_unlock(&nfs4_deviceid_lock);
141 synchronize_rcu();
142
143 /* balance the initial ref set in pnfs_insert_deviceid */
144 if (atomic_dec_and_test(&d->ref))
145 return d;
146
147 return NULL;
148}
149EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
150
151/*
152 * Delete a deviceid from cache
153 *
154 * @clp struct nfs_client qualifying the deviceid
155 * @id deviceid to delete
156 */
157void
158nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
159 const struct nfs_client *clp, const struct nfs4_deviceid *id)
160{
161 struct nfs4_deviceid_node *d;
162
163 d = nfs4_unhash_put_deviceid(ld, clp, id);
164 if (!d)
165 return;
166 d->ld->free_deviceid_node(d);
167}
168EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
169
170void
171nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
172 const struct pnfs_layoutdriver_type *ld,
173 const struct nfs_client *nfs_client,
174 const struct nfs4_deviceid *id)
175{
176 INIT_HLIST_NODE(&d->node);
177 d->ld = ld;
178 d->nfs_client = nfs_client;
179 d->deviceid = *id;
180 atomic_set(&d->ref, 1);
181}
182EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
183
184/*
185 * Uniquely initialize and insert a deviceid node into cache
186 *
187 * @new new deviceid node
188 * Note that the caller must set up the following members:
189 * new->ld
190 * new->nfs_client
191 * new->deviceid
192 *
193 * @ret the inserted node, if none found, otherwise, the found entry.
194 */
195struct nfs4_deviceid_node *
196nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
197{
198 struct nfs4_deviceid_node *d;
199 long hash;
200
201 spin_lock(&nfs4_deviceid_lock);
202 hash = nfs4_deviceid_hash(&new->deviceid);
203 d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
204 if (d) {
205 spin_unlock(&nfs4_deviceid_lock);
206 return d;
207 }
208
209 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
210 spin_unlock(&nfs4_deviceid_lock);
211
212 return new;
213}
214EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
215
216/*
217 * Dereference a deviceid node and delete it when its reference count drops
218 * to zero.
219 *
220 * @d deviceid node to put
221 *
222 * @ret true iff the node was deleted
223 */
224bool
225nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
226{
227 if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
228 return false;
229 hlist_del_init_rcu(&d->node);
230 spin_unlock(&nfs4_deviceid_lock);
231 synchronize_rcu();
232 d->ld->free_deviceid_node(d);
233 return true;
234}
235EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
236
237static void
238_deviceid_purge_client(const struct nfs_client *clp, long hash)
239{
240 struct nfs4_deviceid_node *d;
241 struct hlist_node *n, *next;
242 HLIST_HEAD(tmp);
243
244 rcu_read_lock();
245 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
246 if (d->nfs_client == clp && atomic_read(&d->ref)) {
247 hlist_del_init_rcu(&d->node);
248 hlist_add_head(&d->node, &tmp);
249 }
250 rcu_read_unlock();
251
252 if (hlist_empty(&tmp))
253 return;
254
255 synchronize_rcu();
256 hlist_for_each_entry_safe(d, n, next, &tmp, node)
257 if (atomic_dec_and_test(&d->ref))
258 d->ld->free_deviceid_node(d);
259}
260
261void
262nfs4_deviceid_purge_client(const struct nfs_client *clp)
263{
264 long h;
265
266 spin_lock(&nfs4_deviceid_lock);
267 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
268 _deviceid_purge_client(clp, h);
269 spin_unlock(&nfs4_deviceid_lock);
270}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2bcf0dc306a1..20a7f952e244 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
288 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
289 289
290 BUG_ON(desc->pg_lseg != NULL); 290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); 291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
292 req_offset(req), desc->pg_count,
293 IOMODE_READ, GFP_KERNEL);
292 ClearPageError(page); 294 ClearPageError(page);
293 offset = 0; 295 offset = 0;
294 nbytes = desc->pg_count; 296 nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
351 } 353 }
352 req = nfs_list_entry(data->pages.next); 354 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages)) 355 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); 356 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
357 req_offset(req), desc->pg_count,
358 IOMODE_READ, GFP_KERNEL);
355 359
356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 360 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
357 0, lseg); 361 0, lseg);
@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
660 if (ret == 0) 664 if (ret == 0)
661 goto read_complete; /* all pages were read */ 665 goto read_complete; /* all pages were read */
662 666
663 pnfs_pageio_init_read(&pgio, inode);
664 if (rsize < PAGE_CACHE_SIZE) 667 if (rsize < PAGE_CACHE_SIZE)
665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 668 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
666 else 669 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e288f06d3fa7..ce40e5c568ba 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
63#include "iostat.h" 63#include "iostat.h"
64#include "internal.h" 64#include "internal.h"
65#include "fscache.h" 65#include "fscache.h"
66#include "pnfs.h"
66 67
67#define NFSDBG_FACILITY NFSDBG_VFS 68#define NFSDBG_FACILITY NFSDBG_VFS
68 69
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
732 733
733 return 0; 734 return 0;
734} 735}
736#ifdef CONFIG_NFS_V4_1
737void show_sessions(struct seq_file *m, struct nfs_server *server)
738{
739 if (nfs4_has_session(server->nfs_client))
740 seq_printf(m, ",sessions");
741}
742#else
743void show_sessions(struct seq_file *m, struct nfs_server *server) {}
744#endif
745
746#ifdef CONFIG_NFS_V4_1
747void show_pnfs(struct seq_file *m, struct nfs_server *server)
748{
749 seq_printf(m, ",pnfs=");
750 if (server->pnfs_curr_ld)
751 seq_printf(m, "%s", server->pnfs_curr_ld->name);
752 else
753 seq_printf(m, "not configured");
754}
755#else /* CONFIG_NFS_V4_1 */
756void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
757#endif /* CONFIG_NFS_V4_1 */
735 758
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) 759static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{ 760{
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
792 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 815 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
793 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 816 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
794 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); 817 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
818 show_sessions(m, nfss);
819 show_pnfs(m, nfss);
795 } 820 }
796#endif 821#endif
797 822
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c715b4ac92..e268e3b23497 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
939 atomic_set(&req->wb_complete, requests); 939 atomic_set(&req->wb_complete, requests);
940 940
941 BUG_ON(desc->pg_lseg); 941 BUG_ON(desc->pg_lseg);
942 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); 942 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
943 req_offset(req), desc->pg_count,
944 IOMODE_RW, GFP_NOFS);
943 ClearPageError(page); 945 ClearPageError(page);
944 offset = 0; 946 offset = 0;
945 nbytes = desc->pg_count; 947 nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
1013 } 1015 }
1014 req = nfs_list_entry(data->pages.next); 1016 req = nfs_list_entry(data->pages.next);
1015 if ((!lseg) && list_is_singular(&data->pages)) 1017 if ((!lseg) && list_is_singular(&data->pages))
1016 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); 1018 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1019 req_offset(req), desc->pg_count,
1020 IOMODE_RW, GFP_NOFS);
1017 1021
1018 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1022 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1019 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1023 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1032{ 1036{
1033 size_t wsize = NFS_SERVER(inode)->wsize; 1037 size_t wsize = NFS_SERVER(inode)->wsize;
1034 1038
1035 pnfs_pageio_init_write(pgio, inode);
1036
1037 if (wsize < PAGE_CACHE_SIZE) 1039 if (wsize < PAGE_CACHE_SIZE)
1038 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1040 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
1039 else 1041 else
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ad000aeb21a2..b9566e46219f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1354 if (IS_ERR(exp)) 1354 if (IS_ERR(exp))
1355 return nfserrno(PTR_ERR(exp)); 1355 return nfserrno(PTR_ERR(exp));
1356 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); 1356 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
1357 if (rv)
1358 goto out;
1359 rv = check_nfsd_access(exp, rqstp);
1360 if (rv)
1361 fh_put(fhp);
1362out:
1363 exp_put(exp); 1357 exp_put(exp);
1364 return rv; 1358 return rv;
1365} 1359}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2247fc91d5e9..9095f3c21df9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
245 } 245 }
246 246
247 /* Now create the file and set attributes */ 247 /* Now create the file and set attributes */
248 nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len, 248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
249 attr, newfhp, 249 attr, newfhp,
250 argp->createmode, argp->verf, NULL, NULL); 250 argp->createmode, argp->verf, NULL, NULL);
251 251
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index ad48faca20fc..08c6e36ab2eb 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -842,7 +842,7 @@ out:
842 return rv; 842 return rv;
843} 843}
844 844
845__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) 845static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
846{ 846{
847 struct svc_fh fh; 847 struct svc_fh fh;
848 int err; 848 int err;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5fcb1396a7e3..3a6dbd70b34b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
196 196
197 /* 197 /*
198 * Note: create modes (UNCHECKED,GUARDED...) are the same 198 * Note: create modes (UNCHECKED,GUARDED...) are the same
199 * in NFSv4 as in v3. 199 * in NFSv4 as in v3 except EXCLUSIVE4_1.
200 */ 200 */
201 status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data, 201 status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
202 open->op_fname.len, &open->op_iattr, 202 open->op_fname.len, &open->op_iattr,
203 &resfh, open->op_createmode, 203 &resfh, open->op_createmode,
204 (u32 *)open->op_verf.data, 204 (u32 *)open->op_verf.data,
@@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
403 cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; 403 cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
404 memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, 404 memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
405 putfh->pf_fhlen); 405 putfh->pf_fhlen);
406 return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); 406 return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
407} 407}
408 408
409static __be32 409static __be32
@@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
762 __be32 err; 762 __be32 err;
763 763
764 fh_init(&resfh, NFS4_FHSIZE); 764 fh_init(&resfh, NFS4_FHSIZE);
765 err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
766 if (err)
767 return err;
765 err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, 768 err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
766 secinfo->si_name, secinfo->si_namelen, 769 secinfo->si_name, secinfo->si_namelen,
767 &exp, &dentry); 770 &exp, &dentry);
@@ -986,6 +989,9 @@ enum nfsd4_op_flags {
986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ 989 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
987 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ 990 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
988 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ 991 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
992 /* For rfc 5661 section 2.6.3.1.1: */
993 OP_HANDLES_WRONGSEC = 1 << 3,
994 OP_IS_PUTFH_LIKE = 1 << 4,
989}; 995};
990 996
991struct nfsd4_operation { 997struct nfsd4_operation {
@@ -1031,6 +1037,44 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
1031 return nfs_ok; 1037 return nfs_ok;
1032} 1038}
1033 1039
1040static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
1041{
1042 return &nfsd4_ops[op->opnum];
1043}
1044
1045static bool need_wrongsec_check(struct svc_rqst *rqstp)
1046{
1047 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1048 struct nfsd4_compoundargs *argp = rqstp->rq_argp;
1049 struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
1050 struct nfsd4_op *next = &argp->ops[resp->opcnt];
1051 struct nfsd4_operation *thisd;
1052 struct nfsd4_operation *nextd;
1053
1054 thisd = OPDESC(this);
1055 /*
1056 * Most ops check wronsec on our own; only the putfh-like ops
1057 * have special rules.
1058 */
1059 if (!(thisd->op_flags & OP_IS_PUTFH_LIKE))
1060 return false;
1061 /*
1062 * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
1063 * put-filehandle operation if we're not going to use the
1064 * result:
1065 */
1066 if (argp->opcnt == resp->opcnt)
1067 return false;
1068
1069 nextd = OPDESC(next);
1070 /*
1071 * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
1072 * errors themselves as necessary; others should check for them
1073 * now:
1074 */
1075 return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
1076}
1077
1034/* 1078/*
1035 * COMPOUND call. 1079 * COMPOUND call.
1036 */ 1080 */
@@ -1108,7 +1152,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1108 goto encode_op; 1152 goto encode_op;
1109 } 1153 }
1110 1154
1111 opdesc = &nfsd4_ops[op->opnum]; 1155 opdesc = OPDESC(op);
1112 1156
1113 if (!cstate->current_fh.fh_dentry) { 1157 if (!cstate->current_fh.fh_dentry) {
1114 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { 1158 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
@@ -1126,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1126 else 1170 else
1127 BUG_ON(op->status == nfs_ok); 1171 BUG_ON(op->status == nfs_ok);
1128 1172
1173 if (!op->status && need_wrongsec_check(rqstp))
1174 op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
1175
1129encode_op: 1176encode_op:
1130 /* Only from SEQUENCE */ 1177 /* Only from SEQUENCE */
1131 if (resp->cstate.status == nfserr_replay_cache) { 1178 if (resp->cstate.status == nfserr_replay_cache) {
@@ -1217,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1217 }, 1264 },
1218 [OP_LOOKUP] = { 1265 [OP_LOOKUP] = {
1219 .op_func = (nfsd4op_func)nfsd4_lookup, 1266 .op_func = (nfsd4op_func)nfsd4_lookup,
1267 .op_flags = OP_HANDLES_WRONGSEC,
1220 .op_name = "OP_LOOKUP", 1268 .op_name = "OP_LOOKUP",
1221 }, 1269 },
1222 [OP_LOOKUPP] = { 1270 [OP_LOOKUPP] = {
1223 .op_func = (nfsd4op_func)nfsd4_lookupp, 1271 .op_func = (nfsd4op_func)nfsd4_lookupp,
1272 .op_flags = OP_HANDLES_WRONGSEC,
1224 .op_name = "OP_LOOKUPP", 1273 .op_name = "OP_LOOKUPP",
1225 }, 1274 },
1226 [OP_NVERIFY] = { 1275 [OP_NVERIFY] = {
@@ -1229,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
1229 }, 1278 },
1230 [OP_OPEN] = { 1279 [OP_OPEN] = {
1231 .op_func = (nfsd4op_func)nfsd4_open, 1280 .op_func = (nfsd4op_func)nfsd4_open,
1281 .op_flags = OP_HANDLES_WRONGSEC,
1232 .op_name = "OP_OPEN", 1282 .op_name = "OP_OPEN",
1233 }, 1283 },
1234 [OP_OPEN_CONFIRM] = { 1284 [OP_OPEN_CONFIRM] = {
@@ -1241,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = {
1241 }, 1291 },
1242 [OP_PUTFH] = { 1292 [OP_PUTFH] = {
1243 .op_func = (nfsd4op_func)nfsd4_putfh, 1293 .op_func = (nfsd4op_func)nfsd4_putfh,
1244 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1294 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1295 | OP_IS_PUTFH_LIKE,
1245 .op_name = "OP_PUTFH", 1296 .op_name = "OP_PUTFH",
1246 }, 1297 },
1247 [OP_PUTPUBFH] = { 1298 [OP_PUTPUBFH] = {
1248 .op_func = (nfsd4op_func)nfsd4_putrootfh, 1299 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1249 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1300 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1301 | OP_IS_PUTFH_LIKE,
1250 .op_name = "OP_PUTPUBFH", 1302 .op_name = "OP_PUTPUBFH",
1251 }, 1303 },
1252 [OP_PUTROOTFH] = { 1304 [OP_PUTROOTFH] = {
1253 .op_func = (nfsd4op_func)nfsd4_putrootfh, 1305 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1254 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1306 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1307 | OP_IS_PUTFH_LIKE,
1255 .op_name = "OP_PUTROOTFH", 1308 .op_name = "OP_PUTROOTFH",
1256 }, 1309 },
1257 [OP_READ] = { 1310 [OP_READ] = {
@@ -1281,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
1281 }, 1334 },
1282 [OP_RESTOREFH] = { 1335 [OP_RESTOREFH] = {
1283 .op_func = (nfsd4op_func)nfsd4_restorefh, 1336 .op_func = (nfsd4op_func)nfsd4_restorefh,
1284 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1337 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1338 | OP_IS_PUTFH_LIKE,
1285 .op_name = "OP_RESTOREFH", 1339 .op_name = "OP_RESTOREFH",
1286 }, 1340 },
1287 [OP_SAVEFH] = { 1341 [OP_SAVEFH] = {
1288 .op_func = (nfsd4op_func)nfsd4_savefh, 1342 .op_func = (nfsd4op_func)nfsd4_savefh,
1343 .op_flags = OP_HANDLES_WRONGSEC,
1289 .op_name = "OP_SAVEFH", 1344 .op_name = "OP_SAVEFH",
1290 }, 1345 },
1291 [OP_SECINFO] = { 1346 [OP_SECINFO] = {
1292 .op_func = (nfsd4op_func)nfsd4_secinfo, 1347 .op_func = (nfsd4op_func)nfsd4_secinfo,
1348 .op_flags = OP_HANDLES_WRONGSEC,
1293 .op_name = "OP_SECINFO", 1349 .op_name = "OP_SECINFO",
1294 }, 1350 },
1295 [OP_SETATTR] = { 1351 [OP_SETATTR] = {
@@ -1353,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
1353 }, 1409 },
1354 [OP_SECINFO_NO_NAME] = { 1410 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, 1411 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1412 .op_flags = OP_HANDLES_WRONGSEC,
1356 .op_name = "OP_SECINFO_NO_NAME", 1413 .op_name = "OP_SECINFO_NO_NAME",
1357 }, 1414 },
1358}; 1415};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4cf04e11c66c..e98f3c2e9492 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1519,6 +1519,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1519 bool confirm_me = false; 1519 bool confirm_me = false;
1520 int status = 0; 1520 int status = 0;
1521 1521
1522 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1523 return nfserr_inval;
1524
1522 nfs4_lock_state(); 1525 nfs4_lock_state();
1523 unconf = find_unconfirmed_client(&cr_ses->clientid); 1526 unconf = find_unconfirmed_client(&cr_ses->clientid);
1524 conf = find_confirmed_client(&cr_ses->clientid); 1527 conf = find_confirmed_client(&cr_ses->clientid);
@@ -1637,8 +1640,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1637 return nfserr_badsession; 1640 return nfserr_badsession;
1638 1641
1639 status = nfsd4_map_bcts_dir(&bcts->dir); 1642 status = nfsd4_map_bcts_dir(&bcts->dir);
1640 nfsd4_new_conn(rqstp, cstate->session, bcts->dir); 1643 if (!status)
1641 return nfs_ok; 1644 nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1645 return status;
1642} 1646}
1643 1647
1644static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1648static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -1725,6 +1729,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1725 return; 1729 return;
1726} 1730}
1727 1731
1732static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
1733{
1734 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1735
1736 return args->opcnt > session->se_fchannel.maxops;
1737}
1738
1728__be32 1739__be32
1729nfsd4_sequence(struct svc_rqst *rqstp, 1740nfsd4_sequence(struct svc_rqst *rqstp,
1730 struct nfsd4_compound_state *cstate, 1741 struct nfsd4_compound_state *cstate,
@@ -1753,6 +1764,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1753 if (!session) 1764 if (!session)
1754 goto out; 1765 goto out;
1755 1766
1767 status = nfserr_too_many_ops;
1768 if (nfsd4_session_too_many_ops(rqstp, session))
1769 goto out;
1770
1756 status = nfserr_badslot; 1771 status = nfserr_badslot;
1757 if (seq->slotid >= session->se_fchannel.maxreqs) 1772 if (seq->slotid >= session->se_fchannel.maxreqs)
1758 goto out; 1773 goto out;
@@ -1808,6 +1823,8 @@ out:
1808__be32 1823__be32
1809nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) 1824nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
1810{ 1825{
1826 int status = 0;
1827
1811 if (rc->rca_one_fs) { 1828 if (rc->rca_one_fs) {
1812 if (!cstate->current_fh.fh_dentry) 1829 if (!cstate->current_fh.fh_dentry)
1813 return nfserr_nofilehandle; 1830 return nfserr_nofilehandle;
@@ -1817,9 +1834,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
1817 */ 1834 */
1818 return nfs_ok; 1835 return nfs_ok;
1819 } 1836 }
1837
1820 nfs4_lock_state(); 1838 nfs4_lock_state();
1821 if (is_client_expired(cstate->session->se_client)) { 1839 status = nfserr_complete_already;
1822 nfs4_unlock_state(); 1840 if (cstate->session->se_client->cl_firststate)
1841 goto out;
1842
1843 status = nfserr_stale_clientid;
1844 if (is_client_expired(cstate->session->se_client))
1823 /* 1845 /*
1824 * The following error isn't really legal. 1846 * The following error isn't really legal.
1825 * But we only get here if the client just explicitly 1847 * But we only get here if the client just explicitly
@@ -1827,11 +1849,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
1827 * error it gets back on an operation for the dead 1849 * error it gets back on an operation for the dead
1828 * client. 1850 * client.
1829 */ 1851 */
1830 return nfserr_stale_clientid; 1852 goto out;
1831 } 1853
1854 status = nfs_ok;
1832 nfsd4_create_clid_dir(cstate->session->se_client); 1855 nfsd4_create_clid_dir(cstate->session->se_client);
1856out:
1833 nfs4_unlock_state(); 1857 nfs4_unlock_state();
1834 return nfs_ok; 1858 return status;
1835} 1859}
1836 1860
1837__be32 1861__be32
@@ -2462,7 +2486,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2462 return NULL; 2486 return NULL;
2463} 2487}
2464 2488
2465int share_access_to_flags(u32 share_access) 2489static int share_access_to_flags(u32 share_access)
2466{ 2490{
2467 share_access &= ~NFS4_SHARE_WANT_MASK; 2491 share_access &= ~NFS4_SHARE_WANT_MASK;
2468 2492
@@ -2882,7 +2906,7 @@ out:
2882 return status; 2906 return status;
2883} 2907}
2884 2908
2885struct lock_manager nfsd4_manager = { 2909static struct lock_manager nfsd4_manager = {
2886}; 2910};
2887 2911
2888static void 2912static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6766af00d98..990181103214 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) 424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{ 425{
426 DECODE_HEAD; 426 DECODE_HEAD;
427 u32 dummy;
428 427
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); 428 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); 429 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir); 430 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we 431 /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
433 * should be using ctsa_use_conn_in_rdma_mode: */ 432 * could help us figure out we should be using it. */
434 READ32(dummy);
435
436 DECODE_TAIL; 433 DECODE_TAIL;
437} 434}
438 435
@@ -588,8 +585,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
588 READ_BUF(lockt->lt_owner.len); 585 READ_BUF(lockt->lt_owner.len);
589 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 586 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
590 587
591 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
592 return nfserr_inval;
593 DECODE_TAIL; 588 DECODE_TAIL;
594} 589}
595 590
@@ -3120,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3120 return nfserr; 3115 return nfserr;
3121} 3116}
3122 3117
3123__be32 3118static __be32
3124nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, 3119nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3125 struct nfsd4_sequence *seq) 3120 struct nfsd4_sequence *seq)
3126{ 3121{
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 55c8e63af0be..90c6aa6d5e0f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
344 * which clients virtually always use auth_sys for, 344 * which clients virtually always use auth_sys for,
345 * even while using RPCSEC_GSS for NFS. 345 * even while using RPCSEC_GSS for NFS.
346 */ 346 */
347 if (access & NFSD_MAY_LOCK) 347 if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
348 goto skip_pseudoflavor_check; 348 goto skip_pseudoflavor_check;
349 /* 349 /*
350 * Clients may expect to be able to use auth_sys during mount, 350 * Clients may expect to be able to use auth_sys during mount,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 129f3c9f62d5..d5718273bb32 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
181 struct svc_export *exp; 181 struct svc_export *exp;
182 struct dentry *dparent; 182 struct dentry *dparent;
183 struct dentry *dentry; 183 struct dentry *dentry;
184 __be32 err;
185 int host_err; 184 int host_err;
186 185
187 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); 186 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
188 187
189 /* Obtain dentry and export. */
190 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
191 if (err)
192 return err;
193
194 dparent = fhp->fh_dentry; 188 dparent = fhp->fh_dentry;
195 exp = fhp->fh_export; 189 exp = fhp->fh_export;
196 exp_get(exp); 190 exp_get(exp);
@@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
254 struct dentry *dentry; 248 struct dentry *dentry;
255 __be32 err; 249 __be32 err;
256 250
251 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
252 if (err)
253 return err;
257 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); 254 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
258 if (err) 255 if (err)
259 return err; 256 return err;
@@ -877,13 +874,11 @@ static __be32
877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 874nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 875 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
879{ 876{
880 struct inode *inode;
881 mm_segment_t oldfs; 877 mm_segment_t oldfs;
882 __be32 err; 878 __be32 err;
883 int host_err; 879 int host_err;
884 880
885 err = nfserr_perm; 881 err = nfserr_perm;
886 inode = file->f_path.dentry->d_inode;
887 882
888 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 883 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
889 struct splice_desc sd = { 884 struct splice_desc sd = {
@@ -1340,11 +1335,18 @@ out_nfserr:
1340} 1335}
1341 1336
1342#ifdef CONFIG_NFSD_V3 1337#ifdef CONFIG_NFSD_V3
1338
1339static inline int nfsd_create_is_exclusive(int createmode)
1340{
1341 return createmode == NFS3_CREATE_EXCLUSIVE
1342 || createmode == NFS4_CREATE_EXCLUSIVE4_1;
1343}
1344
1343/* 1345/*
1344 * NFSv3 version of nfsd_create 1346 * NFSv3 and NFSv4 version of nfsd_create
1345 */ 1347 */
1346__be32 1348__be32
1347nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, 1349do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1348 char *fname, int flen, struct iattr *iap, 1350 char *fname, int flen, struct iattr *iap,
1349 struct svc_fh *resfhp, int createmode, u32 *verifier, 1351 struct svc_fh *resfhp, int createmode, u32 *verifier,
1350 int *truncp, int *created) 1352 int *truncp, int *created)
@@ -1396,7 +1398,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1396 if (err) 1398 if (err)
1397 goto out; 1399 goto out;
1398 1400
1399 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1401 if (nfsd_create_is_exclusive(createmode)) {
1400 /* solaris7 gets confused (bugid 4218508) if these have 1402 /* solaris7 gets confused (bugid 4218508) if these have
1401 * the high bit set, so just clear the high bits. If this is 1403 * the high bit set, so just clear the high bits. If this is
1402 * ever changed to use different attrs for storing the 1404 * ever changed to use different attrs for storing the
@@ -1437,6 +1439,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1437 && dchild->d_inode->i_atime.tv_sec == v_atime 1439 && dchild->d_inode->i_atime.tv_sec == v_atime
1438 && dchild->d_inode->i_size == 0 ) 1440 && dchild->d_inode->i_size == 0 )
1439 break; 1441 break;
1442 case NFS4_CREATE_EXCLUSIVE4_1:
1443 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
1444 && dchild->d_inode->i_atime.tv_sec == v_atime
1445 && dchild->d_inode->i_size == 0 )
1446 goto set_attr;
1440 /* fallthru */ 1447 /* fallthru */
1441 case NFS3_CREATE_GUARDED: 1448 case NFS3_CREATE_GUARDED:
1442 err = nfserr_exist; 1449 err = nfserr_exist;
@@ -1455,7 +1462,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1455 1462
1456 nfsd_check_ignore_resizing(iap); 1463 nfsd_check_ignore_resizing(iap);
1457 1464
1458 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1465 if (nfsd_create_is_exclusive(createmode)) {
1459 /* Cram the verifier into atime/mtime */ 1466 /* Cram the verifier into atime/mtime */
1460 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1467 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
1461 | ATTR_MTIME_SET|ATTR_ATIME_SET; 1468 | ATTR_MTIME_SET|ATTR_ATIME_SET;
@@ -2034,7 +2041,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2034 struct inode *inode = dentry->d_inode; 2041 struct inode *inode = dentry->d_inode;
2035 int err; 2042 int err;
2036 2043
2037 if (acc == NFSD_MAY_NOP) 2044 if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
2038 return 0; 2045 return 0;
2039#if 0 2046#if 0
2040 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", 2047 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9a370a5e36b7..e0bbac04d1dd 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -17,10 +17,14 @@
17#define NFSD_MAY_SATTR 8 17#define NFSD_MAY_SATTR 8
18#define NFSD_MAY_TRUNC 16 18#define NFSD_MAY_TRUNC 16
19#define NFSD_MAY_LOCK 32 19#define NFSD_MAY_LOCK 32
20#define NFSD_MAY_MASK 63
21
22/* extra hints to permission and open routines: */
20#define NFSD_MAY_OWNER_OVERRIDE 64 23#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 24#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 25#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23#define NFSD_MAY_NOT_BREAK_LEASE 512 26#define NFSD_MAY_NOT_BREAK_LEASE 512
27#define NFSD_MAY_BYPASS_GSS 1024
24 28
25#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 29#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
26#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 30#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -54,7 +58,7 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
54 int type, dev_t rdev, struct svc_fh *res); 58 int type, dev_t rdev, struct svc_fh *res);
55#ifdef CONFIG_NFSD_V3 59#ifdef CONFIG_NFSD_V3
56__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); 60__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
57__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, 61__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
58 char *name, int len, struct iattr *attrs, 62 char *name, int len, struct iattr *attrs,
59 struct svc_fh *res, int createmode, 63 struct svc_fh *res, int createmode,
60 u32 *verifier, int *truncp, int *created); 64 u32 *verifier, int *truncp, int *created);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 587f18432832..b954878ad6ce 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -917,7 +917,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
917 * construction. This function can be called both as a single operation 917 * construction. This function can be called both as a single operation
918 * and as a part of indivisible file operations. 918 * and as a part of indivisible file operations.
919 */ 919 */
920void nilfs_dirty_inode(struct inode *inode) 920void nilfs_dirty_inode(struct inode *inode, int flags)
921{ 921{
922 struct nilfs_transaction_info ti; 922 struct nilfs_transaction_info ti;
923 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 923 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1102a5fbb744..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
334 struct nilfs_transaction_info ti; 334 struct nilfs_transaction_info ti;
335 int err; 335 int err;
336 336
337 dentry_unhash(dentry);
338
339 err = nilfs_transaction_begin(dir->i_sb, &ti, 0); 337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
340 if (err) 338 if (err)
341 return err; 339 return err;
@@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
371 struct nilfs_transaction_info ti; 369 struct nilfs_transaction_info ti;
372 int err; 370 int err;
373 371
374 if (new_inode && S_ISDIR(new_inode->i_mode))
375 dentry_unhash(new_dentry);
376
377 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); 372 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
378 if (unlikely(err)) 373 if (unlikely(err))
379 return err; 374 return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a9c6a531f80c..f02b9ad43a21 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -269,7 +269,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
269extern int nilfs_inode_dirty(struct inode *); 269extern int nilfs_inode_dirty(struct inode *);
270int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); 270int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
271extern int nilfs_mark_inode_dirty(struct inode *); 271extern int nilfs_mark_inode_dirty(struct inode *);
272extern void nilfs_dirty_inode(struct inode *); 272extern void nilfs_dirty_inode(struct inode *, int flags);
273int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 273int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
274 __u64 start, __u64 len); 274 __u64 start, __u64 len);
275 275
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c368360c35a1..3b8d3979e03b 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -241,11 +241,9 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
241 int ret; 241 int ret;
242 242
243 243
244 if (S_ISDIR(inode->i_mode)) { 244 if (S_ISDIR(inode->i_mode) &&
245 dentry_unhash(dentry); 245 !omfs_dir_is_empty(inode))
246 if (!omfs_dir_is_empty(inode)) 246 return -ENOTEMPTY;
247 return -ENOTEMPTY;
248 }
249 247
250 ret = omfs_delete_entry(dentry); 248 ret = omfs_delete_entry(dentry);
251 if (ret) 249 if (ret)
@@ -382,9 +380,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
382 int err; 380 int err;
383 381
384 if (new_inode) { 382 if (new_inode) {
385 if (S_ISDIR(new_inode->i_mode))
386 dentry_unhash(new_dentry);
387
388 /* overwriting existing file/dir */ 383 /* overwriting existing file/dir */
389 err = omfs_remove(new_dir, new_dentry); 384 err = omfs_remove(new_dir, new_dentry);
390 if (err) 385 if (err)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index f82e762eeca2..d545e97d99c3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -255,13 +255,7 @@ ssize_t part_discard_alignment_show(struct device *dev,
255 struct device_attribute *attr, char *buf) 255 struct device_attribute *attr, char *buf)
256{ 256{
257 struct hd_struct *p = dev_to_part(dev); 257 struct hd_struct *p = dev_to_part(dev);
258 struct gendisk *disk = dev_to_disk(dev); 258 return sprintf(buf, "%u\n", p->discard_alignment);
259 unsigned int alignment = 0;
260
261 if (disk->queue)
262 alignment = queue_limit_discard_alignment(&disk->queue->limits,
263 p->start_sect);
264 return sprintf(buf, "%u\n", alignment);
265} 259}
266 260
267ssize_t part_stat_show(struct device *dev, 261ssize_t part_stat_show(struct device *dev,
@@ -455,6 +449,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
455 p->start_sect = start; 449 p->start_sect = start;
456 p->alignment_offset = 450 p->alignment_offset =
457 queue_limit_alignment_offset(&disk->queue->limits, start); 451 queue_limit_alignment_offset(&disk->queue->limits, start);
452 p->discard_alignment =
453 queue_limit_discard_alignment(&disk->queue->limits, start);
458 p->nr_sects = len; 454 p->nr_sects = len;
459 p->partno = partno; 455 p->partno = partno;
460 p->policy = get_disk_ro(disk); 456 p->policy = get_disk_ro(disk);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4ede550517a6..14def991d9dd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,9 @@
83#include <linux/pid_namespace.h> 83#include <linux/pid_namespace.h>
84#include <linux/fs_struct.h> 84#include <linux/fs_struct.h>
85#include <linux/slab.h> 85#include <linux/slab.h>
86#ifdef CONFIG_HARDWALL
87#include <asm/hardwall.h>
88#endif
86#include "internal.h" 89#include "internal.h"
87 90
88/* NOTE: 91/* NOTE:
@@ -2842,6 +2845,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2842#ifdef CONFIG_TASK_IO_ACCOUNTING 2845#ifdef CONFIG_TASK_IO_ACCOUNTING
2843 INF("io", S_IRUGO, proc_tgid_io_accounting), 2846 INF("io", S_IRUGO, proc_tgid_io_accounting),
2844#endif 2847#endif
2848#ifdef CONFIG_HARDWALL
2849 INF("hardwall", S_IRUGO, proc_pid_hardwall),
2850#endif
2845}; 2851};
2846 2852
2847static int proc_tgid_base_readdir(struct file * filp, 2853static int proc_tgid_base_readdir(struct file * filp,
@@ -3181,6 +3187,9 @@ static const struct pid_entry tid_base_stuff[] = {
3181#ifdef CONFIG_TASK_IO_ACCOUNTING 3187#ifdef CONFIG_TASK_IO_ACCOUNTING
3182 INF("io", S_IRUGO, proc_tid_io_accounting), 3188 INF("io", S_IRUGO, proc_tid_io_accounting),
3183#endif 3189#endif
3190#ifdef CONFIG_HARDWALL
3191 INF("hardwall", S_IRUGO, proc_pid_hardwall),
3192#endif
3184}; 3193};
3185 3194
3186static int proc_tid_base_readdir(struct file * filp, 3195static int proc_tid_base_readdir(struct file * filp,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 76c8164d5651..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
831 INITIALIZE_PATH(path); 831 INITIALIZE_PATH(path);
832 struct reiserfs_dir_entry de; 832 struct reiserfs_dir_entry de;
833 833
834 dentry_unhash(dentry);
835
836 /* we will be doing 2 balancings and update 2 stat data, we change quotas 834 /* we will be doing 2 balancings and update 2 stat data, we change quotas
837 * of the owner of the directory and of the owner of the parent directory. 835 * of the owner of the directory and of the owner of the parent directory.
838 * The quota structure is possibly deleted only on last iput => outside 836 * The quota structure is possibly deleted only on last iput => outside
@@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1227 unsigned long savelink = 1; 1225 unsigned long savelink = 1;
1228 struct timespec ctime; 1226 struct timespec ctime;
1229 1227
1230 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1231 dentry_unhash(new_dentry);
1232
1233 /* three balancings: (1) old name removal, (2) new name insertion 1228 /* three balancings: (1) old name removal, (2) new name insertion
1234 and (3) maybe "save" link insertion 1229 and (3) maybe "save" link insertion
1235 stat data updates: (1) old directory, 1230 stat data updates: (1) old directory,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b216ff6be1c9..aa91089162cb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -568,7 +568,7 @@ static void destroy_inodecache(void)
568} 568}
569 569
570/* we don't mark inodes dirty, we just log them */ 570/* we don't mark inodes dirty, we just log them */
571static void reiserfs_dirty_inode(struct inode *inode) 571static void reiserfs_dirty_inode(struct inode *inode, int flags)
572{ 572{
573 struct reiserfs_transaction_handle th; 573 struct reiserfs_transaction_handle th;
574 574
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 50f1abccd1cd..e8a62f41b458 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -98,7 +98,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
98 98
99 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, 99 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
100 I_MUTEX_CHILD, dir->i_sb); 100 I_MUTEX_CHILD, dir->i_sb);
101 dentry_unhash(dentry);
102 error = dir->i_op->rmdir(dir, dentry); 101 error = dir->i_op->rmdir(dir, dentry);
103 if (!error) 102 if (!error)
104 dentry->d_inode->i_flags |= S_DEAD; 103 dentry->d_inode->i_flags |= S_DEAD;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 730c56248c9b..5e1101ff276f 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -147,7 +147,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
147 * table[0] points to the first inode lookup table metadata block, 147 * table[0] points to the first inode lookup table metadata block,
148 * this should be less than lookup_table_start 148 * this should be less than lookup_table_start
149 */ 149 */
150 if (!IS_ERR(table) && table[0] >= lookup_table_start) { 150 if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
151 kfree(table); 151 kfree(table);
152 return ERR_PTR(-EINVAL); 152 return ERR_PTR(-EINVAL);
153 } 153 }
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 1516a6490bfb..0ed6edbc5c71 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -90,7 +90,7 @@ __le64 *squashfs_read_fragment_index_table(struct super_block *sb,
90 * table[0] points to the first fragment table metadata block, this 90 * table[0] points to the first fragment table metadata block, this
91 * should be less than fragment_table_start 91 * should be less than fragment_table_start
92 */ 92 */
93 if (!IS_ERR(table) && table[0] >= fragment_table_start) { 93 if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
94 kfree(table); 94 kfree(table);
95 return ERR_PTR(-EINVAL); 95 return ERR_PTR(-EINVAL);
96 } 96 }
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index a70858e0fb44..d38ea3dab951 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -93,7 +93,7 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
93 * table[0] points to the first id lookup table metadata block, this 93 * table[0] points to the first id lookup table metadata block, this
94 * should be less than id_table_start 94 * should be less than id_table_start
95 */ 95 */
96 if (!IS_ERR(table) && table[0] >= id_table_start) { 96 if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
97 kfree(table); 97 kfree(table);
98 return ERR_PTR(-EINVAL); 98 return ERR_PTR(-EINVAL);
99 } 99 }
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6f26abee3597..7438850c62d0 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -245,7 +245,7 @@ allocate_id_index_table:
245 msblk->id_table = NULL; 245 msblk->id_table = NULL;
246 goto failed_mount; 246 goto failed_mount;
247 } 247 }
248 next_table = msblk->id_table[0]; 248 next_table = le64_to_cpu(msblk->id_table[0]);
249 249
250 /* Handle inode lookup table */ 250 /* Handle inode lookup table */
251 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 251 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
@@ -261,7 +261,7 @@ allocate_id_index_table:
261 msblk->inode_lookup_table = NULL; 261 msblk->inode_lookup_table = NULL;
262 goto failed_mount; 262 goto failed_mount;
263 } 263 }
264 next_table = msblk->inode_lookup_table[0]; 264 next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
265 265
266 sb->s_export_op = &squashfs_export_ops; 266 sb->s_export_op = &squashfs_export_ops;
267 267
@@ -286,7 +286,7 @@ handle_fragments:
286 msblk->fragment_index = NULL; 286 msblk->fragment_index = NULL;
287 goto failed_mount; 287 goto failed_mount;
288 } 288 }
289 next_table = msblk->fragment_index[0]; 289 next_table = le64_to_cpu(msblk->fragment_index[0]);
290 290
291check_directory_table: 291check_directory_table:
292 /* Sanity check directory_table */ 292 /* Sanity check directory_table */
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e2cc6756f3b1..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
196 struct inode *inode = dentry->d_inode; 196 struct inode *inode = dentry->d_inode;
197 int err = -ENOTEMPTY; 197 int err = -ENOTEMPTY;
198 198
199 dentry_unhash(dentry);
200
201 if (sysv_empty_dir(inode)) { 199 if (sysv_empty_dir(inode)) {
202 err = sysv_unlink(dir, dentry); 200 err = sysv_unlink(dir, dentry);
203 if (!err) { 201 if (!err) {
@@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
224 struct sysv_dir_entry * old_de; 222 struct sysv_dir_entry * old_de;
225 int err = -ENOENT; 223 int err = -ENOENT;
226 224
227 if (new_inode && S_ISDIR(new_inode->i_mode))
228 dentry_unhash(new_dentry);
229
230 old_de = sysv_find_entry(old_dentry, &old_page); 225 old_de = sysv_find_entry(old_dentry, &old_page);
231 if (!old_de) 226 if (!old_de)
232 goto out; 227 goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c2b80943560d..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
656 struct ubifs_inode *dir_ui = ubifs_inode(dir); 656 struct ubifs_inode *dir_ui = ubifs_inode(dir);
657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
658 658
659 dentry_unhash(dentry);
660
661 /* 659 /*
662 * Budget request settings: deletion direntry, deletion inode and 660 * Budget request settings: deletion direntry, deletion inode and
663 * changing the parent inode. If budgeting fails, go ahead anyway 661 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 976 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
979 struct timespec time; 977 struct timespec time;
980 978
981 if (new_inode && S_ISDIR(new_inode->i_mode))
982 dentry_unhash(new_dentry);
983
984 /* 979 /*
985 * Budget request settings: deletion direntry, new direntry, removing 980 * Budget request settings: deletion direntry, new direntry, removing
986 * the old inode, and changing old and new parent directory inodes. 981 * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 166951e0dcd3..3be645e012c9 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -581,6 +581,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
581 ubifs_assert(wbuf->size % c->min_io_size == 0); 581 ubifs_assert(wbuf->size % c->min_io_size == 0);
582 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 582 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
583 ubifs_assert(!c->ro_media && !c->ro_mount); 583 ubifs_assert(!c->ro_media && !c->ro_mount);
584 ubifs_assert(!c->space_fixup);
584 if (c->leb_size - wbuf->offs >= c->max_write_size) 585 if (c->leb_size - wbuf->offs >= c->max_write_size)
585 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); 586 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size));
586 587
@@ -759,6 +760,7 @@ int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
759 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); 760 ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
760 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); 761 ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
761 ubifs_assert(!c->ro_media && !c->ro_mount); 762 ubifs_assert(!c->ro_media && !c->ro_mount);
763 ubifs_assert(!c->space_fixup);
762 764
763 if (c->ro_error) 765 if (c->ro_error)
764 return -EROFS; 766 return -EROFS;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 34b1679e6e3a..cef0460f4c54 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -669,6 +669,7 @@ out_free:
669 669
670out_release: 670out_release:
671 release_head(c, BASEHD); 671 release_head(c, BASEHD);
672 kfree(dent);
672out_ro: 673out_ro:
673 ubifs_ro_mode(c, err); 674 ubifs_ro_mode(c, err);
674 if (last_reference) 675 if (last_reference)
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index bd644bf587a8..a5422fffbd69 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -674,7 +674,7 @@ static int kill_orphans(struct ubifs_info *c)
674 if (IS_ERR(sleb)) { 674 if (IS_ERR(sleb)) {
675 if (PTR_ERR(sleb) == -EUCLEAN) 675 if (PTR_ERR(sleb) == -EUCLEAN)
676 sleb = ubifs_recover_leb(c, lnum, 0, 676 sleb = ubifs_recover_leb(c, lnum, 0,
677 c->sbuf, 0); 677 c->sbuf, -1);
678 if (IS_ERR(sleb)) { 678 if (IS_ERR(sleb)) {
679 err = PTR_ERR(sleb); 679 err = PTR_ERR(sleb);
680 break; 680 break;
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 731d9e2e7b50..783d8e0beb76 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -564,19 +564,15 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
564} 564}
565 565
566/** 566/**
567 * drop_last_node - drop the last node or group of nodes. 567 * drop_last_group - drop the last group of nodes.
568 * @sleb: scanned LEB information 568 * @sleb: scanned LEB information
569 * @offs: offset of dropped nodes is returned here 569 * @offs: offset of dropped nodes is returned here
570 * @grouped: non-zero if whole group of nodes have to be dropped
571 * 570 *
572 * This is a helper function for 'ubifs_recover_leb()' which drops the last 571 * This is a helper function for 'ubifs_recover_leb()' which drops the last
573 * node of the scanned LEB or the last group of nodes if @grouped is not zero. 572 * group of nodes of the scanned LEB.
574 * This function returns %1 if a node was dropped and %0 otherwise.
575 */ 573 */
576static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) 574static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
577{ 575{
578 int dropped = 0;
579
580 while (!list_empty(&sleb->nodes)) { 576 while (!list_empty(&sleb->nodes)) {
581 struct ubifs_scan_node *snod; 577 struct ubifs_scan_node *snod;
582 struct ubifs_ch *ch; 578 struct ubifs_ch *ch;
@@ -585,17 +581,40 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
585 list); 581 list);
586 ch = snod->node; 582 ch = snod->node;
587 if (ch->group_type != UBIFS_IN_NODE_GROUP) 583 if (ch->group_type != UBIFS_IN_NODE_GROUP)
588 return dropped; 584 break;
589 dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs); 585
586 dbg_rcvry("dropping grouped node at %d:%d",
587 sleb->lnum, snod->offs);
588 *offs = snod->offs;
589 list_del(&snod->list);
590 kfree(snod);
591 sleb->nodes_cnt -= 1;
592 }
593}
594
595/**
596 * drop_last_node - drop the last node.
597 * @sleb: scanned LEB information
598 * @offs: offset of dropped nodes is returned here
599 * (left unchanged when the LEB has no scanned nodes)
600 *
601 * This is a helper function for 'ubifs_recover_leb()' which drops the last
602 * node of the scanned LEB.
603 */
604static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
605{
606 struct ubifs_scan_node *snod;
607
608 if (!list_empty(&sleb->nodes)) {
609 snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
610 list);
611
612 dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
590 *offs = snod->offs; 613 *offs = snod->offs;
591 list_del(&snod->list); 614 list_del(&snod->list);
592 kfree(snod); 615 kfree(snod);
593 sleb->nodes_cnt -= 1; 616 sleb->nodes_cnt -= 1;
594 dropped = 1;
595 if (!grouped)
596 break;
597 } 617 }
598 return dropped;
599} 618}
600 619
601/** 620/**
@@ -604,7 +623,8 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
604 * @lnum: LEB number 623 * @lnum: LEB number
605 * @offs: offset 624 * @offs: offset
606 * @sbuf: LEB-sized buffer to use 625 * @sbuf: LEB-sized buffer to use
607 * @grouped: nodes may be grouped for recovery 626 * @jhead: journal head number this LEB belongs to (%-1 if the LEB does not
627 * belong to any journal head)
608 * 628 *
609 * This function does a scan of a LEB, but caters for errors that might have 629 * This function does a scan of a LEB, but caters for errors that might have
610 * been caused by the unclean unmount from which we are attempting to recover. 630 * been caused by the unclean unmount from which we are attempting to recover.
@@ -612,13 +632,14 @@ static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped)
612 * found, and a negative error code in case of failure. 632 * found, and a negative error code in case of failure.
613 */ 633 */
614struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 634struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
615 int offs, void *sbuf, int grouped) 635 int offs, void *sbuf, int jhead)
616{ 636{
617 int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; 637 int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit;
638 int grouped = jhead == -1 ? 0 : c->jheads[jhead].grouped;
618 struct ubifs_scan_leb *sleb; 639 struct ubifs_scan_leb *sleb;
619 void *buf = sbuf + offs; 640 void *buf = sbuf + offs;
620 641
621 dbg_rcvry("%d:%d", lnum, offs); 642 dbg_rcvry("%d:%d, jhead %d, grouped %d", lnum, offs, jhead, grouped);
622 643
623 sleb = ubifs_start_scan(c, lnum, offs, sbuf); 644 sleb = ubifs_start_scan(c, lnum, offs, sbuf);
624 if (IS_ERR(sleb)) 645 if (IS_ERR(sleb))
@@ -635,7 +656,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
635 * Scan quietly until there is an error from which we cannot 656 * Scan quietly until there is an error from which we cannot
636 * recover 657 * recover
637 */ 658 */
638 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); 659 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
639 if (ret == SCANNED_A_NODE) { 660 if (ret == SCANNED_A_NODE) {
640 /* A valid node, and not a padding node */ 661 /* A valid node, and not a padding node */
641 struct ubifs_ch *ch = buf; 662 struct ubifs_ch *ch = buf;
@@ -695,59 +716,62 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
695 * If nodes are grouped, always drop the incomplete group at 716 * If nodes are grouped, always drop the incomplete group at
696 * the end. 717 * the end.
697 */ 718 */
698 drop_last_node(sleb, &offs, 1); 719 drop_last_group(sleb, &offs);
699 720
700 /* 721 if (jhead == GCHD) {
701 * While we are in the middle of the same min. I/O unit keep dropping 722 /*
702 * nodes. So basically, what we want is to make sure that the last min. 723 * If this LEB belongs to the GC head then while we are in the
703 * I/O unit where we saw the corruption is dropped completely with all 724 * middle of the same min. I/O unit keep dropping nodes. So
704 * the uncorrupted node which may possibly sit there. 725 * basically, what we want is to make sure that the last min.
705 * 726 * I/O unit where we saw the corruption is dropped completely
706 * In other words, let's name the min. I/O unit where the corruption 727 * with all the uncorrupted nodes which may possibly sit there.
707 * starts B, and the previous min. I/O unit A. The below code tries to 728 *
708 * deal with a situation when half of B contains valid nodes or the end 729 * In other words, let's name the min. I/O unit where the
709 * of a valid node, and the second half of B contains corrupted data or 730 * corruption starts B, and the previous min. I/O unit A. The
710 * garbage. This means that UBIFS had been writing to B just before the 731 * below code tries to deal with a situation when half of B
711 * power cut happened. I do not know how realistic is this scenario 732 * contains valid nodes or the end of a valid node, and the
712 * that half of the min. I/O unit had been written successfully and the 733 * second half of B contains corrupted data or garbage. This
713 * other half not, but this is possible in our 'failure mode emulation' 734 * means that UBIFS had been writing to B just before the power
714 * infrastructure at least. 735 * cut happened. I do not know how realistic is this scenario
715 * 736 * that half of the min. I/O unit had been written successfully
716 * So what is the problem, why we need to drop those nodes? Whey can't 737 * and the other half not, but this is possible in our 'failure
717 * we just clean-up the second half of B by putting a padding node 738 * mode emulation' infrastructure at least.
718 * there? We can, and this works fine with one exception which was 739 *
719 * reproduced with power cut emulation testing and happens extremely 740 * So what is the problem, why we need to drop those nodes? Why
720 * rarely. The description follows, but it is worth noting that that is 741 * can't we just clean-up the second half of B by putting a
721 * only about the GC head, so we could do this trick only if the bud 742 * padding node there? We can, and this works fine with one
722 * belongs to the GC head, but it does not seem to be worth an 743 * exception which was reproduced with power cut emulation
723 * additional "if" statement. 744 * testing and happens extremely rarely.
724 * 745 *
725 * So, imagine the file-system is full, we run GC which is moving valid 746 * Imagine the file-system is full, we run GC which starts
726 * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head 747 * moving valid nodes from LEB X to LEB Y (obviously, LEB Y is
727 * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X 748 * the current GC head LEB). The @c->gc_lnum is -1, which means
728 * and will try to continue. Imagine that LEB X is currently the 749 * that GC will retain LEB X and will try to continue. Imagine
729 * dirtiest LEB, and the amount of used space in LEB Y is exactly the 750 * that LEB X is currently the dirtiest LEB, and the amount of
730 * same as amount of free space in LEB X. 751 * used space in LEB Y is exactly the same as amount of free
731 * 752 * space in LEB X.
732 * And a power cut happens when nodes are moved from LEB X to LEB Y. We 753 *
733 * are here trying to recover LEB Y which is the GC head LEB. We find 754 * And a power cut happens when nodes are moved from LEB X to
734 * the min. I/O unit B as described above. Then we clean-up LEB Y by 755 * LEB Y. We are here trying to recover LEB Y which is the GC
735 * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function 756 * head LEB. We find the min. I/O unit B as described above.
736 * fails, because it cannot find a dirty LEB which could be GC'd into 757 * Then we clean-up LEB Y by padding min. I/O unit. And later
737 * LEB Y! Even LEB X does not match because the amount of valid nodes 758 * 'ubifs_rcvry_gc_commit()' function fails, because it cannot
738 * there does not fit the free space in LEB Y any more! And this is 759 * find a dirty LEB which could be GC'd into LEB Y! Even LEB X
739 * because of the padding node which we added to LEB Y. The 760 * does not match because the amount of valid nodes there does
740 * user-visible effect of this which I once observed and analysed is 761 * not fit the free space in LEB Y any more! And this is
741 * that we cannot mount the file-system with -ENOSPC error. 762 * because of the padding node which we added to LEB Y. The
742 * 763 * user-visible effect of this which I once observed and
743 * So obviously, to make sure that situation does not happen we should 764 * analysed is that we cannot mount the file-system with
744 * free min. I/O unit B in LEB Y completely and the last used min. I/O 765 * -ENOSPC error.
745 * unit in LEB Y should be A. This is basically what the below code 766 *
746 * tries to do. 767 * So obviously, to make sure that situation does not happen we
747 */ 768 * should free min. I/O unit B in LEB Y completely and the last
748 while (min_io_unit == round_down(offs, c->min_io_size) && 769 * used min. I/O unit in LEB Y should be A. This is basically
749 min_io_unit != offs && 770 * what the below code tries to do.
750 drop_last_node(sleb, &offs, grouped)); 771 */
772 while (offs > min_io_unit)
773 drop_last_node(sleb, &offs);
774 }
751 775
752 buf = sbuf + offs; 776 buf = sbuf + offs;
753 len = c->leb_size - offs; 777 len = c->leb_size - offs;
@@ -881,7 +905,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
881 } 905 }
882 ubifs_scan_destroy(sleb); 906 ubifs_scan_destroy(sleb);
883 } 907 }
884 return ubifs_recover_leb(c, lnum, offs, sbuf, 0); 908 return ubifs_recover_leb(c, lnum, offs, sbuf, -1);
885} 909}
886 910
887/** 911/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 6617280d1679..5e97161ce4d3 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -557,8 +557,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b)
557 * these LEBs could possibly be written to at the power cut 557 * these LEBs could possibly be written to at the power cut
558 * time. 558 * time.
559 */ 559 */
560 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, 560 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, b->bud->jhead);
561 b->bud->jhead != GCHD);
562 else 561 else
563 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); 562 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
564 if (IS_ERR(sleb)) 563 if (IS_ERR(sleb))
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 46961c003236..9e1d05666fed 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,13 +277,18 @@ static int kick_a_thread(void)
277 return 0; 277 return 0;
278} 278}
279 279
280int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) 280int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
281{ 281{
282 int nr = sc->nr_to_scan;
282 int freed, contention = 0; 283 int freed, contention = 0;
283 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); 284 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
284 285
285 if (nr == 0) 286 if (nr == 0)
286 return clean_zn_cnt; 287 /*
288 * Due to the way UBIFS updates the clean znode counter it may
289 * temporarily be negative.
290 */
291 return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
287 292
288 if (!clean_zn_cnt) { 293 if (!clean_zn_cnt) {
289 /* 294 /*
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6db0bdaa9f74..b5aeb5a8ebed 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -382,7 +382,7 @@ done:
382 end_writeback(inode); 382 end_writeback(inode);
383} 383}
384 384
385static void ubifs_dirty_inode(struct inode *inode) 385static void ubifs_dirty_inode(struct inode *inode, int flags)
386{ 386{
387 struct ubifs_inode *ui = ubifs_inode(inode); 387 struct ubifs_inode *ui = ubifs_inode(inode);
388 388
@@ -811,15 +811,18 @@ static int alloc_wbufs(struct ubifs_info *c)
811 811
812 c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; 812 c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
813 c->jheads[i].wbuf.jhead = i; 813 c->jheads[i].wbuf.jhead = i;
814 c->jheads[i].grouped = 1;
814 } 815 }
815 816
816 c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; 817 c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
817 /* 818 /*
818 * Garbage Collector head likely contains long-term data and 819 * Garbage Collector head likely contains long-term data and
819 * does not need to be synchronized by timer. 820 * does not need to be synchronized by timer. Also GC head nodes are
821 * not grouped.
820 */ 822 */
821 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; 823 c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
822 c->jheads[GCHD].wbuf.no_timer = 1; 824 c->jheads[GCHD].wbuf.no_timer = 1;
825 c->jheads[GCHD].grouped = 0;
823 826
824 return 0; 827 return 0;
825} 828}
@@ -1284,12 +1287,25 @@ static int mount_ubifs(struct ubifs_info *c)
1284 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { 1287 if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
1285 ubifs_msg("recovery needed"); 1288 ubifs_msg("recovery needed");
1286 c->need_recovery = 1; 1289 c->need_recovery = 1;
1287 if (!c->ro_mount) { 1290 }
1288 err = ubifs_recover_inl_heads(c, c->sbuf); 1291
1289 if (err) 1292 if (c->need_recovery && !c->ro_mount) {
1290 goto out_master; 1293 err = ubifs_recover_inl_heads(c, c->sbuf);
1291 } 1294 if (err)
1292 } else if (!c->ro_mount) { 1295 goto out_master;
1296 }
1297
1298 err = ubifs_lpt_init(c, 1, !c->ro_mount);
1299 if (err)
1300 goto out_master;
1301
1302 if (!c->ro_mount && c->space_fixup) {
1303 err = ubifs_fixup_free_space(c);
1304 if (err)
1305 goto out_master;
1306 }
1307
1308 if (!c->ro_mount) {
1293 /* 1309 /*
1294 * Set the "dirty" flag so that if we reboot uncleanly we 1310 * Set the "dirty" flag so that if we reboot uncleanly we
1295 * will notice this immediately on the next mount. 1311 * will notice this immediately on the next mount.
@@ -1297,13 +1313,9 @@ static int mount_ubifs(struct ubifs_info *c)
1297 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); 1313 c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
1298 err = ubifs_write_master(c); 1314 err = ubifs_write_master(c);
1299 if (err) 1315 if (err)
1300 goto out_master; 1316 goto out_lpt;
1301 } 1317 }
1302 1318
1303 err = ubifs_lpt_init(c, 1, !c->ro_mount);
1304 if (err)
1305 goto out_lpt;
1306
1307 err = dbg_check_idx_size(c, c->bi.old_idx_sz); 1319 err = dbg_check_idx_size(c, c->bi.old_idx_sz);
1308 if (err) 1320 if (err)
1309 goto out_lpt; 1321 goto out_lpt;
@@ -1396,12 +1408,6 @@ static int mount_ubifs(struct ubifs_info *c)
1396 } else 1408 } else
1397 ubifs_assert(c->lst.taken_empty_lebs > 0); 1409 ubifs_assert(c->lst.taken_empty_lebs > 0);
1398 1410
1399 if (!c->ro_mount && c->space_fixup) {
1400 err = ubifs_fixup_free_space(c);
1401 if (err)
1402 goto out_infos;
1403 }
1404
1405 err = dbg_check_filesystem(c); 1411 err = dbg_check_filesystem(c);
1406 if (err) 1412 if (err)
1407 goto out_infos; 1413 goto out_infos;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8119b1fd8d94..91b4213dde84 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2876,12 +2876,13 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
2876 */ 2876 */
2877void ubifs_tnc_close(struct ubifs_info *c) 2877void ubifs_tnc_close(struct ubifs_info *c)
2878{ 2878{
2879 long clean_freed;
2880
2881 tnc_destroy_cnext(c); 2879 tnc_destroy_cnext(c);
2882 if (c->zroot.znode) { 2880 if (c->zroot.znode) {
2883 clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode); 2881 long n;
2884 atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt); 2882
2883 ubifs_destroy_tnc_subtree(c->zroot.znode);
2884 n = atomic_long_read(&c->clean_zn_cnt);
2885 atomic_long_sub(n, &ubifs_clean_zn_cnt);
2885 } 2886 }
2886 kfree(c->gap_lebs); 2887 kfree(c->gap_lebs);
2887 kfree(c->ilebs); 2888 kfree(c->ilebs);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 93d1412a06f0..f79983d6f860 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -722,12 +722,14 @@ struct ubifs_bud {
722 * struct ubifs_jhead - journal head. 722 * struct ubifs_jhead - journal head.
723 * @wbuf: head's write-buffer 723 * @wbuf: head's write-buffer
724 * @buds_list: list of bud LEBs belonging to this journal head 724 * @buds_list: list of bud LEBs belonging to this journal head
725 * @grouped: non-zero if UBIFS groups nodes when writing to this journal head
725 * 726 *
726 * Note, the @buds list is protected by the @c->buds_lock. 727 * Note, the @buds list is protected by the @c->buds_lock.
727 */ 728 */
728struct ubifs_jhead { 729struct ubifs_jhead {
729 struct ubifs_wbuf wbuf; 730 struct ubifs_wbuf wbuf;
730 struct list_head buds_list; 731 struct list_head buds_list;
732 unsigned int grouped:1;
731}; 733};
732 734
733/** 735/**
@@ -1614,7 +1616,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
1614int ubifs_tnc_end_commit(struct ubifs_info *c); 1616int ubifs_tnc_end_commit(struct ubifs_info *c);
1615 1617
1616/* shrinker.c */ 1618/* shrinker.c */
1617int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); 1619int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
1618 1620
1619/* commit.c */ 1621/* commit.c */
1620int ubifs_bg_thread(void *info); 1622int ubifs_bg_thread(void *info);
@@ -1742,7 +1744,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
1742int ubifs_recover_master_node(struct ubifs_info *c); 1744int ubifs_recover_master_node(struct ubifs_info *c);
1743int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); 1745int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
1744struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, 1746struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
1745 int offs, void *sbuf, int grouped); 1747 int offs, void *sbuf, int jhead);
1746struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, 1748struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
1747 int offs, void *sbuf); 1749 int offs, void *sbuf);
1748int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); 1750int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4d76594c2a8f..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
785 785
786 dentry_unhash(dentry);
787
788 retval = -ENOENT; 786 retval = -ENOENT;
789 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 787 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
790 if (!fi) 788 if (!fi)
@@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1083 struct kernel_lb_addr tloc; 1081 struct kernel_lb_addr tloc;
1084 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1082 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1085 1083
1086 if (new_inode && S_ISDIR(new_inode->i_mode))
1087 dentry_unhash(new_dentry);
1088
1089 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1084 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1090 if (ofi) { 1085 if (ofi) {
1091 if (ofibh.sbh != ofibh.ebh) 1086 if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 953ebdfc5bf7..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
258 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
259 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
260 260
261 dentry_unhash(dentry);
262
263 lock_ufs(dir->i_sb); 261 lock_ufs(dir->i_sb);
264 if (ufs_empty_dir (inode)) { 262 if (ufs_empty_dir (inode)) {
265 err = ufs_unlink(dir, dentry); 263 err = ufs_unlink(dir, dentry);
@@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
284 struct ufs_dir_entry *old_de; 282 struct ufs_dir_entry *old_de;
285 int err = -ENOENT; 283 int err = -ENOENT;
286 284
287 if (new_inode && S_ISDIR(new_inode->i_mode))
288 dentry_unhash(new_dentry);
289
290 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 285 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
291 if (!old_de) 286 if (!old_de)
292 goto out; 287 goto out;
diff --git a/fs/xattr.c b/fs/xattr.c
index f1ef94974dea..f060663ab70c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -46,18 +46,22 @@ xattr_permission(struct inode *inode, const char *name, int mask)
46 return 0; 46 return 0;
47 47
48 /* 48 /*
49 * The trusted.* namespace can only be accessed by a privileged user. 49 * The trusted.* namespace can only be accessed by privileged users.
50 */ 50 */
51 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) 51 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
52 return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); 52 if (!capable(CAP_SYS_ADMIN))
53 return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
54 return 0;
55 }
53 56
54 /* In user.* namespace, only regular files and directories can have 57 /*
58 * In the user.* namespace, only regular files and directories can have
55 * extended attributes. For sticky directories, only the owner and 59 * extended attributes. For sticky directories, only the owner and
56 * privileged user can write attributes. 60 * privileged users can write attributes.
57 */ 61 */
58 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { 62 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 63 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
60 return -EPERM; 64 return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && 65 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
62 (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) 66 (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
63 return -EPERM; 67 return -EPERM;
@@ -87,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
87{ 91{
88 struct inode *inode = dentry->d_inode; 92 struct inode *inode = dentry->d_inode;
89 int error = -EOPNOTSUPP; 93 int error = -EOPNOTSUPP;
94 int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
95 XATTR_SECURITY_PREFIX_LEN);
90 96
97 if (issec)
98 inode->i_flags &= ~S_NOSEC;
91 if (inode->i_op->setxattr) { 99 if (inode->i_op->setxattr) {
92 error = inode->i_op->setxattr(dentry, name, value, size, flags); 100 error = inode->i_op->setxattr(dentry, name, value, size, flags);
93 if (!error) { 101 if (!error) {
@@ -95,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
95 security_inode_post_setxattr(dentry, name, value, 103 security_inode_post_setxattr(dentry, name, value,
96 size, flags); 104 size, flags);
97 } 105 }
98 } else if (!strncmp(name, XATTR_SECURITY_PREFIX, 106 } else if (issec) {
99 XATTR_SECURITY_PREFIX_LEN)) {
100 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; 107 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
101 error = security_inode_setsecurity(inode, suffix, value, 108 error = security_inode_setsecurity(inode, suffix, value,
102 size, flags); 109 size, flags);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 98b9c91fcdf1..1e3a7ce804dc 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -925,7 +925,8 @@ xfs_fs_inode_init_once(
925 */ 925 */
926STATIC void 926STATIC void
927xfs_fs_dirty_inode( 927xfs_fs_dirty_inode(
928 struct inode *inode) 928 struct inode *inode,
929 int flags)
929{ 930{
930 barrier(); 931 barrier();
931 XFS_I(inode)->i_update_core = 1; 932 XFS_I(inode)->i_update_core = 1;