aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-06-04 06:13:06 -0400
committerIngo Molnar <mingo@elte.hu>2011-06-04 06:13:06 -0400
commit710054ba25c0d1f8f41c22ce13ba336503fb5318 (patch)
treef9b09b722bf511841539173d946f90a20fc2e59a /fs
parent74c355fbdfedd3820046dba4f537876cea54c207 (diff)
parentb273fa9716aa1564bee88ceee62f9042981cdc81 (diff)
Merge branch 'perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/urgent
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/affs/namei.c5
-rw-r--r--fs/afs/dir.c5
-rw-r--r--fs/attr.c7
-rw-r--r--fs/autofs4/root.c2
-rw-r--r--fs/bfs/dir.c3
-rw-r--r--fs/bio.c16
-rw-r--r--fs/btrfs/ctree.h2
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/buffer.c1
-rw-r--r--fs/cifs/cifsacl.c3
-rw-r--r--fs/coda/dir.c5
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/ecryptfs/crypto.c74
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h26
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/ecryptfs/inode.c286
-rw-r--r--fs/ecryptfs/main.c84
-rw-r--r--fs/ecryptfs/super.c16
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext4/ext4.h2
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/fat/namei_msdos.c5
-rw-r--r--fs/fat/namei_vfat.c5
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/fuse/dir.c5
-rw-r--r--fs/hfs/dir.c6
-rw-r--r--fs/hfsplus/dir.c8
-rw-r--r--fs/hostfs/hostfs_kern.c5
-rw-r--r--fs/hpfs/namei.c5
-rw-r--r--fs/inode.c54
-rw-r--r--fs/jffs2/dir.c9
-rw-r--r--fs/jffs2/fs.c2
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/scan.c19
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/namei.c5
-rw-r--r--fs/logfs/dir.c5
-rw-r--r--fs/minix/namei.c5
-rw-r--r--fs/namei.c44
-rw-r--r--fs/ncpfs/dir.c15
-rw-r--r--fs/nfs/Kconfig10
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/callback.h17
-rw-r--r--fs/nfs/callback_proc.c51
-rw-r--r--fs/nfs/callback_xdr.c96
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.c14
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/inode.c11
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs4filelayout.c38
-rw-r--r--fs/nfs/nfs4filelayout.h8
-rw-r--r--fs/nfs/nfs4filelayoutdev.c119
-rw-r--r--fs/nfs/nfs4proc.c107
-rw-r--r--fs/nfs/nfs4state.c6
-rw-r--r--fs/nfs/nfs4xdr.c132
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/objlayout/Kbuild5
-rw-r--r--fs/nfs/objlayout/objio_osd.c1057
-rw-r--r--fs/nfs/objlayout/objlayout.c712
-rw-r--r--fs/nfs/objlayout/objlayout.h187
-rw-r--r--fs/nfs/objlayout/pnfs_osd_xdr_cli.c412
-rw-r--r--fs/nfs/pagelist.c62
-rw-r--r--fs/nfs/pnfs.c342
-rw-r--r--fs/nfs/pnfs.h117
-rw-r--r--fs/nfs/pnfs_dev.c270
-rw-r--r--fs/nfs/read.c9
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/write.c10
-rw-r--r--fs/nfsd/export.c6
-rw-r--r--fs/nfsd/nfs3proc.c2
-rw-r--r--fs/nfsd/nfs3xdr.c2
-rw-r--r--fs/nfsd/nfs4proc.c73
-rw-r--r--fs/nfsd/nfs4state.c42
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/vfs.c33
-rw-r--r--fs/nfsd/vfs.h6
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/omfs/dir.c11
-rw-r--r--fs/proc/base.c9
-rw-r--r--fs/reiserfs/namei.c5
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/squashfs/export.c2
-rw-r--r--fs/squashfs/fragment.c2
-rw-r--r--fs/squashfs/id.c2
-rw-r--r--fs/squashfs/super.c6
-rw-r--r--fs/sysv/namei.c5
-rw-r--r--fs/ubifs/dir.c5
-rw-r--r--fs/ubifs/shrinker.c3
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/ubifs/ubifs.h2
-rw-r--r--fs/udf/namei.c5
-rw-r--r--fs/ufs/namei.c5
-rw-r--r--fs/xattr.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c3
101 files changed, 4014 insertions, 859 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d7f3e69ae29..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -814,7 +814,6 @@ int v9fs_vfs_unlink(struct inode *i, struct dentry *d)
814 814
815int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) 815int v9fs_vfs_rmdir(struct inode *i, struct dentry *d)
816{ 816{
817 dentry_unhash(d);
818 return v9fs_remove(i, d, 1); 817 return v9fs_remove(i, d, 1);
819} 818}
820 819
@@ -840,9 +839,6 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
840 struct p9_fid *newdirfid; 839 struct p9_fid *newdirfid;
841 struct p9_wstat wstat; 840 struct p9_wstat wstat;
842 841
843 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
844 dentry_unhash(new_dentry);
845
846 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 842 P9_DPRINTK(P9_DEBUG_VFS, "\n");
847 retval = 0; 843 retval = 0;
848 old_inode = old_dentry->d_inode; 844 old_inode = old_dentry->d_inode;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 03330e2e390c..e3e9efc1fdd8 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -320,8 +320,6 @@ affs_rmdir(struct inode *dir, struct dentry *dentry)
320 dentry->d_inode->i_ino, 320 dentry->d_inode->i_ino,
321 (int)dentry->d_name.len, dentry->d_name.name); 321 (int)dentry->d_name.len, dentry->d_name.name);
322 322
323 dentry_unhash(dentry);
324
325 return affs_remove_header(dentry); 323 return affs_remove_header(dentry);
326} 324}
327 325
@@ -419,9 +417,6 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
419 struct buffer_head *bh = NULL; 417 struct buffer_head *bh = NULL;
420 int retval; 418 int retval;
421 419
422 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
423 dentry_unhash(new_dentry);
424
425 pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n", 420 pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
426 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name, 421 (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
427 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name); 422 (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 2c4e05160042..20c106f24927 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -845,8 +845,6 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
845 _enter("{%x:%u},{%s}", 845 _enter("{%x:%u},{%s}",
846 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name); 846 dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
847 847
848 dentry_unhash(dentry);
849
850 ret = -ENAMETOOLONG; 848 ret = -ENAMETOOLONG;
851 if (dentry->d_name.len >= AFSNAMEMAX) 849 if (dentry->d_name.len >= AFSNAMEMAX)
852 goto error; 850 goto error;
@@ -1148,9 +1146,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
1148 struct key *key; 1146 struct key *key;
1149 int ret; 1147 int ret;
1150 1148
1151 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1152 dentry_unhash(new_dentry);
1153
1154 vnode = AFS_FS_I(old_dentry->d_inode); 1149 vnode = AFS_FS_I(old_dentry->d_inode);
1155 orig_dvnode = AFS_FS_I(old_dir); 1150 orig_dvnode = AFS_FS_I(old_dir);
1156 new_dvnode = AFS_FS_I(new_dir); 1151 new_dvnode = AFS_FS_I(new_dir);
diff --git a/fs/attr.c b/fs/attr.c
index 91dbe2a107f2..caf2aa521e2b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -175,6 +175,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
175 return -EPERM; 175 return -EPERM;
176 } 176 }
177 177
178 if ((ia_valid & ATTR_MODE)) {
179 mode_t amode = attr->ia_mode;
180 /* Flag setting protected by i_mutex */
181 if (is_sxid(amode))
182 inode->i_flags &= ~S_NOSEC;
183 }
184
178 now = current_fs_time(inode->i_sb); 185 now = current_fs_time(inode->i_sb);
179 186
180 attr->ia_ctime = now; 187 attr->ia_ctime = now;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 87d95a8cddbc..f55ae23b137e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
583 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) 583 if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
584 return -EACCES; 584 return -EACCES;
585 585
586 dentry_unhash(dentry);
587
588 if (atomic_dec_and_test(&ino->count)) { 586 if (atomic_dec_and_test(&ino->count)) {
589 p_ino = autofs4_dentry_ino(dentry->d_parent); 587 p_ino = autofs4_dentry_ino(dentry->d_parent);
590 if (p_ino && dentry->d_parent != dentry) 588 if (p_ino && dentry->d_parent != dentry)
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index c7d1d06b0483..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -224,9 +224,6 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
224 struct bfs_sb_info *info; 224 struct bfs_sb_info *info;
225 int error = -ENOENT; 225 int error = -ENOENT;
226 226
227 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
228 dentry_unhash(new_dentry);
229
230 old_bh = new_bh = NULL; 227 old_bh = new_bh = NULL;
231 old_inode = old_dentry->d_inode; 228 old_inode = old_dentry->d_inode;
232 if (S_ISDIR(old_inode->i_mode)) 229 if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/bio.c b/fs/bio.c
index 840a0d755248..9bfade8a609b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -638,10 +638,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
638 * @offset: vec entry offset 638 * @offset: vec entry offset
639 * 639 *
640 * Attempt to add a page to the bio_vec maplist. This can fail for a 640 * Attempt to add a page to the bio_vec maplist. This can fail for a
641 * number of reasons, such as the bio being full or target block 641 * number of reasons, such as the bio being full or target block device
642 * device limitations. The target block device must allow bio's 642 * limitations. The target block device must allow bio's up to PAGE_SIZE,
643 * smaller than PAGE_SIZE, so it is always possible to add a single 643 * so it is always possible to add a single page to an empty bio.
644 * page to an empty bio. This should only be used by REQ_PC bios. 644 *
645 * This should only be used by REQ_PC bios.
645 */ 646 */
646int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, 647int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
647 unsigned int len, unsigned int offset) 648 unsigned int len, unsigned int offset)
@@ -659,10 +660,9 @@ EXPORT_SYMBOL(bio_add_pc_page);
659 * @offset: vec entry offset 660 * @offset: vec entry offset
660 * 661 *
661 * Attempt to add a page to the bio_vec maplist. This can fail for a 662 * Attempt to add a page to the bio_vec maplist. This can fail for a
662 * number of reasons, such as the bio being full or target block 663 * number of reasons, such as the bio being full or target block device
663 * device limitations. The target block device must allow bio's 664 * limitations. The target block device must allow bio's up to PAGE_SIZE,
664 * smaller than PAGE_SIZE, so it is always possible to add a single 665 * so it is always possible to add a single page to an empty bio.
665 * page to an empty bio.
666 */ 666 */
667int bio_add_page(struct bio *bio, struct page *page, unsigned int len, 667int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
668 unsigned int offset) 668 unsigned int offset)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 332323e19dd1..6c093fa98f61 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2524,7 +2524,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2524int btrfs_readpage(struct file *file, struct page *page); 2524int btrfs_readpage(struct file *file, struct page *page);
2525void btrfs_evict_inode(struct inode *inode); 2525void btrfs_evict_inode(struct inode *inode);
2526int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2526int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2527void btrfs_dirty_inode(struct inode *inode); 2527void btrfs_dirty_inode(struct inode *inode, int flags);
2528struct inode *btrfs_alloc_inode(struct super_block *sb); 2528struct inode *btrfs_alloc_inode(struct super_block *sb);
2529void btrfs_destroy_inode(struct inode *inode); 2529void btrfs_destroy_inode(struct inode *inode);
2530int btrfs_drop_inode(struct inode *inode); 2530int btrfs_drop_inode(struct inode *inode);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bb51bb1fa44f..39a9d5750efd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4294,7 +4294,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4294 * FIXME, needs more benchmarking...there are no reasons other than performance 4294 * FIXME, needs more benchmarking...there are no reasons other than performance
4295 * to keep or drop this code. 4295 * to keep or drop this code.
4296 */ 4296 */
4297void btrfs_dirty_inode(struct inode *inode) 4297void btrfs_dirty_inode(struct inode *inode, int flags)
4298{ 4298{
4299 struct btrfs_root *root = BTRFS_I(inode)->root; 4299 struct btrfs_root *root = BTRFS_I(inode)->root;
4300 struct btrfs_trans_handle *trans; 4300 struct btrfs_trans_handle *trans;
diff --git a/fs/buffer.c b/fs/buffer.c
index 698c6b2cc462..49c9aada0374 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2382,6 +2382,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2382 ret = -EAGAIN; 2382 ret = -EAGAIN;
2383 goto out_unlock; 2383 goto out_unlock;
2384 } 2384 }
2385 wait_on_page_writeback(page);
2385 return 0; 2386 return 0;
2386out_unlock: 2387out_unlock:
2387 unlock_page(page); 2388 unlock_page(page);
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 8f1700623b41..21de1d6d5849 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -74,8 +74,9 @@ shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
74 * Run idmap cache shrinker. 74 * Run idmap cache shrinker.
75 */ 75 */
76static int 76static int
77cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 77cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
78{ 78{
79 int nr_to_scan = sc->nr_to_scan;
79 int nr_del = 0; 80 int nr_del = 0;
80 int nr_rem = 0; 81 int nr_rem = 0;
81 struct rb_root *root; 82 struct rb_root *root;
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a46126fd5735..2b8dae4d121e 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -336,8 +336,6 @@ static int coda_rmdir(struct inode *dir, struct dentry *de)
336 int len = de->d_name.len; 336 int len = de->d_name.len;
337 int error; 337 int error;
338 338
339 dentry_unhash(de);
340
341 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); 339 error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len);
342 if (!error) { 340 if (!error) {
343 /* VFS may delete the child */ 341 /* VFS may delete the child */
@@ -361,9 +359,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
361 int new_length = new_dentry->d_name.len; 359 int new_length = new_dentry->d_name.len;
362 int error; 360 int error;
363 361
364 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
365 dentry_unhash(new_dentry);
366
367 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir), 362 error = venus_rename(old_dir->i_sb, coda_i2f(old_dir),
368 coda_i2f(new_dir), old_length, new_length, 363 coda_i2f(new_dir), old_length, new_length,
369 (const char *) old_name, (const char *)new_name); 364 (const char *) old_name, (const char *)new_name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 9d17d350abc5..9a37a9b6de3a 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1359,8 +1359,6 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
1359 struct module *subsys_owner = NULL, *dead_item_owner = NULL; 1359 struct module *subsys_owner = NULL, *dead_item_owner = NULL;
1360 int ret; 1360 int ret;
1361 1361
1362 dentry_unhash(dentry);
1363
1364 if (dentry->d_parent == configfs_sb->s_root) 1362 if (dentry->d_parent == configfs_sb->s_root)
1365 return -EPERM; 1363 return -EPERM;
1366 1364
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index b8d5c8091024..58609bde3b9f 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1024,25 +1024,25 @@ out:
1024} 1024}
1025 1025
1026/** 1026/**
1027 * contains_ecryptfs_marker - check for the ecryptfs marker 1027 * ecryptfs_validate_marker - check for the ecryptfs marker
1028 * @data: The data block in which to check 1028 * @data: The data block in which to check
1029 * 1029 *
1030 * Returns one if marker found; zero if not found 1030 * Returns zero if marker found; -EINVAL if not found
1031 */ 1031 */
1032static int contains_ecryptfs_marker(char *data) 1032static int ecryptfs_validate_marker(char *data)
1033{ 1033{
1034 u32 m_1, m_2; 1034 u32 m_1, m_2;
1035 1035
1036 m_1 = get_unaligned_be32(data); 1036 m_1 = get_unaligned_be32(data);
1037 m_2 = get_unaligned_be32(data + 4); 1037 m_2 = get_unaligned_be32(data + 4);
1038 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2) 1038 if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
1039 return 1; 1039 return 0;
1040 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; " 1040 ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
1041 "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2, 1041 "MAGIC_ECRYPTFS_MARKER = [0x%.8x]\n", m_1, m_2,
1042 MAGIC_ECRYPTFS_MARKER); 1042 MAGIC_ECRYPTFS_MARKER);
1043 ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = " 1043 ecryptfs_printk(KERN_DEBUG, "(m_1 ^ MAGIC_ECRYPTFS_MARKER) = "
1044 "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER)); 1044 "[0x%.8x]\n", (m_1 ^ MAGIC_ECRYPTFS_MARKER));
1045 return 0; 1045 return -EINVAL;
1046} 1046}
1047 1047
1048struct ecryptfs_flag_map_elem { 1048struct ecryptfs_flag_map_elem {
@@ -1201,27 +1201,19 @@ int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code)
1201 return rc; 1201 return rc;
1202} 1202}
1203 1203
1204int ecryptfs_read_and_validate_header_region(char *data, 1204int ecryptfs_read_and_validate_header_region(struct inode *inode)
1205 struct inode *ecryptfs_inode)
1206{ 1205{
1207 struct ecryptfs_crypt_stat *crypt_stat = 1206 u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
1208 &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); 1207 u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
1209 int rc; 1208 int rc;
1210 1209
1211 if (crypt_stat->extent_size == 0) 1210 rc = ecryptfs_read_lower(file_size, 0, ECRYPTFS_SIZE_AND_MARKER_BYTES,
1212 crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; 1211 inode);
1213 rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, 1212 if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
1214 ecryptfs_inode); 1213 return rc >= 0 ? -EINVAL : rc;
1215 if (rc < 0) { 1214 rc = ecryptfs_validate_marker(marker);
1216 printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", 1215 if (!rc)
1217 __func__, rc); 1216 ecryptfs_i_size_init(file_size, inode);
1218 goto out;
1219 }
1220 if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) {
1221 rc = -EINVAL;
1222 } else
1223 rc = 0;
1224out:
1225 return rc; 1217 return rc;
1226} 1218}
1227 1219
@@ -1242,8 +1234,7 @@ ecryptfs_write_header_metadata(char *virt,
1242 (*written) = 6; 1234 (*written) = 6;
1243} 1235}
1244 1236
1245struct kmem_cache *ecryptfs_header_cache_1; 1237struct kmem_cache *ecryptfs_header_cache;
1246struct kmem_cache *ecryptfs_header_cache_2;
1247 1238
1248/** 1239/**
1249 * ecryptfs_write_headers_virt 1240 * ecryptfs_write_headers_virt
@@ -1496,11 +1487,9 @@ static int ecryptfs_read_headers_virt(char *page_virt,
1496 crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private( 1487 crypt_stat->mount_crypt_stat = &ecryptfs_superblock_to_private(
1497 ecryptfs_dentry->d_sb)->mount_crypt_stat; 1488 ecryptfs_dentry->d_sb)->mount_crypt_stat;
1498 offset = ECRYPTFS_FILE_SIZE_BYTES; 1489 offset = ECRYPTFS_FILE_SIZE_BYTES;
1499 rc = contains_ecryptfs_marker(page_virt + offset); 1490 rc = ecryptfs_validate_marker(page_virt + offset);
1500 if (rc == 0) { 1491 if (rc)
1501 rc = -EINVAL;
1502 goto out; 1492 goto out;
1503 }
1504 if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) 1493 if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED))
1505 ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); 1494 ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode);
1506 offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1495 offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
@@ -1567,20 +1556,21 @@ out:
1567 return rc; 1556 return rc;
1568} 1557}
1569 1558
1570int ecryptfs_read_and_validate_xattr_region(char *page_virt, 1559int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
1571 struct dentry *ecryptfs_dentry) 1560 struct inode *inode)
1572{ 1561{
1562 u8 file_size[ECRYPTFS_SIZE_AND_MARKER_BYTES];
1563 u8 *marker = file_size + ECRYPTFS_FILE_SIZE_BYTES;
1573 int rc; 1564 int rc;
1574 1565
1575 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_dentry->d_inode); 1566 rc = ecryptfs_getxattr_lower(ecryptfs_dentry_to_lower(dentry),
1576 if (rc) 1567 ECRYPTFS_XATTR_NAME, file_size,
1577 goto out; 1568 ECRYPTFS_SIZE_AND_MARKER_BYTES);
1578 if (!contains_ecryptfs_marker(page_virt + ECRYPTFS_FILE_SIZE_BYTES)) { 1569 if (rc < ECRYPTFS_SIZE_AND_MARKER_BYTES)
1579 printk(KERN_WARNING "Valid data found in [%s] xattr, but " 1570 return rc >= 0 ? -EINVAL : rc;
1580 "the marker is invalid\n", ECRYPTFS_XATTR_NAME); 1571 rc = ecryptfs_validate_marker(marker);
1581 rc = -EINVAL; 1572 if (!rc)
1582 } 1573 ecryptfs_i_size_init(file_size, inode);
1583out:
1584 return rc; 1574 return rc;
1585} 1575}
1586 1576
@@ -1610,7 +1600,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1610 ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat, 1600 ecryptfs_copy_mount_wide_flags_to_inode_flags(crypt_stat,
1611 mount_crypt_stat); 1601 mount_crypt_stat);
1612 /* Read the first page from the underlying file */ 1602 /* Read the first page from the underlying file */
1613 page_virt = kmem_cache_alloc(ecryptfs_header_cache_1, GFP_USER); 1603 page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER);
1614 if (!page_virt) { 1604 if (!page_virt) {
1615 rc = -ENOMEM; 1605 rc = -ENOMEM;
1616 printk(KERN_ERR "%s: Unable to allocate page_virt\n", 1606 printk(KERN_ERR "%s: Unable to allocate page_virt\n",
@@ -1655,7 +1645,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1655out: 1645out:
1656 if (page_virt) { 1646 if (page_virt) {
1657 memset(page_virt, 0, PAGE_CACHE_SIZE); 1647 memset(page_virt, 0, PAGE_CACHE_SIZE);
1658 kmem_cache_free(ecryptfs_header_cache_1, page_virt); 1648 kmem_cache_free(ecryptfs_header_cache, page_virt);
1659 } 1649 }
1660 return rc; 1650 return rc;
1661} 1651}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index e70282775e2c..43c7c43b06f5 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -200,6 +200,8 @@ ecryptfs_get_key_payload_data(struct key *key)
200#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5 200#define MAGIC_ECRYPTFS_MARKER 0x3c81b7f5
201#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */ 201#define MAGIC_ECRYPTFS_MARKER_SIZE_BYTES 8 /* 4*2 */
202#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64)) 202#define ECRYPTFS_FILE_SIZE_BYTES (sizeof(u64))
203#define ECRYPTFS_SIZE_AND_MARKER_BYTES (ECRYPTFS_FILE_SIZE_BYTES \
204 + MAGIC_ECRYPTFS_MARKER_SIZE_BYTES)
203#define ECRYPTFS_DEFAULT_CIPHER "aes" 205#define ECRYPTFS_DEFAULT_CIPHER "aes"
204#define ECRYPTFS_DEFAULT_KEY_BYTES 16 206#define ECRYPTFS_DEFAULT_KEY_BYTES 16
205#define ECRYPTFS_DEFAULT_HASH "md5" 207#define ECRYPTFS_DEFAULT_HASH "md5"
@@ -603,8 +605,7 @@ extern struct kmem_cache *ecryptfs_file_info_cache;
603extern struct kmem_cache *ecryptfs_dentry_info_cache; 605extern struct kmem_cache *ecryptfs_dentry_info_cache;
604extern struct kmem_cache *ecryptfs_inode_info_cache; 606extern struct kmem_cache *ecryptfs_inode_info_cache;
605extern struct kmem_cache *ecryptfs_sb_info_cache; 607extern struct kmem_cache *ecryptfs_sb_info_cache;
606extern struct kmem_cache *ecryptfs_header_cache_1; 608extern struct kmem_cache *ecryptfs_header_cache;
607extern struct kmem_cache *ecryptfs_header_cache_2;
608extern struct kmem_cache *ecryptfs_xattr_cache; 609extern struct kmem_cache *ecryptfs_xattr_cache;
609extern struct kmem_cache *ecryptfs_key_record_cache; 610extern struct kmem_cache *ecryptfs_key_record_cache;
610extern struct kmem_cache *ecryptfs_key_sig_cache; 611extern struct kmem_cache *ecryptfs_key_sig_cache;
@@ -625,14 +626,9 @@ struct ecryptfs_open_req {
625 struct list_head kthread_ctl_list; 626 struct list_head kthread_ctl_list;
626}; 627};
627 628
628#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001 629struct inode *ecryptfs_get_inode(struct inode *lower_inode,
629int ecryptfs_interpose(struct dentry *hidden_dentry, 630 struct super_block *sb);
630 struct dentry *this_dentry, struct super_block *sb,
631 u32 flags);
632void ecryptfs_i_size_init(const char *page_virt, struct inode *inode); 631void ecryptfs_i_size_init(const char *page_virt, struct inode *inode);
633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
634 struct dentry *lower_dentry,
635 struct inode *ecryptfs_dir_inode);
636int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 632int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
637 size_t *decrypted_name_size, 633 size_t *decrypted_name_size,
638 struct dentry *ecryptfs_dentry, 634 struct dentry *ecryptfs_dentry,
@@ -664,10 +660,9 @@ int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt, 660void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat, 661 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written); 662 size_t *written);
667int ecryptfs_read_and_validate_header_region(char *data, 663int ecryptfs_read_and_validate_header_region(struct inode *inode);
668 struct inode *ecryptfs_inode); 664int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
669int ecryptfs_read_and_validate_xattr_region(char *page_virt, 665 struct inode *inode);
670 struct dentry *ecryptfs_dentry);
671u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); 666u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes);
672int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); 667int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code);
673void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); 668void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat);
@@ -679,9 +674,6 @@ int
679ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, 674ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
680 unsigned char *src, struct dentry *ecryptfs_dentry); 675 unsigned char *src, struct dentry *ecryptfs_dentry);
681int ecryptfs_truncate(struct dentry *dentry, loff_t new_length); 676int ecryptfs_truncate(struct dentry *dentry, loff_t new_length);
682int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode);
683int ecryptfs_inode_set(struct inode *inode, void *lower_inode);
684void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode);
685ssize_t 677ssize_t
686ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, 678ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
687 void *value, size_t size); 679 void *value, size_t size);
@@ -761,7 +753,7 @@ int ecryptfs_privileged_open(struct file **lower_file,
761 struct dentry *lower_dentry, 753 struct dentry *lower_dentry,
762 struct vfsmount *lower_mnt, 754 struct vfsmount *lower_mnt,
763 const struct cred *cred); 755 const struct cred *cred);
764int ecryptfs_get_lower_file(struct dentry *ecryptfs_dentry); 756int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode);
765void ecryptfs_put_lower_file(struct inode *inode); 757void ecryptfs_put_lower_file(struct inode *inode);
766int 758int
767ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, 759ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 566e5472f78c..4ec9eb00a241 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -191,7 +191,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
191 | ECRYPTFS_ENCRYPTED); 191 | ECRYPTFS_ENCRYPTED);
192 } 192 }
193 mutex_unlock(&crypt_stat->cs_mutex); 193 mutex_unlock(&crypt_stat->cs_mutex);
194 rc = ecryptfs_get_lower_file(ecryptfs_dentry); 194 rc = ecryptfs_get_lower_file(ecryptfs_dentry, inode);
195 if (rc) { 195 if (rc) {
196 printk(KERN_ERR "%s: Error attempting to initialize " 196 printk(KERN_ERR "%s: Error attempting to initialize "
197 "the lower file for the dentry with name " 197 "the lower file for the dentry with name "
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bc116b9ffcf2..7349ade17de6 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -51,6 +51,97 @@ static void unlock_dir(struct dentry *dir)
51 dput(dir); 51 dput(dir);
52} 52}
53 53
54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
55{
56 if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode)
57 return 1;
58 return 0;
59}
60
61static int ecryptfs_inode_set(struct inode *inode, void *opaque)
62{
63 struct inode *lower_inode = opaque;
64
65 ecryptfs_set_inode_lower(inode, lower_inode);
66 fsstack_copy_attr_all(inode, lower_inode);
67 /* i_size will be overwritten for encrypted regular files */
68 fsstack_copy_inode_size(inode, lower_inode);
69 inode->i_ino = lower_inode->i_ino;
70 inode->i_version++;
71 inode->i_mapping->a_ops = &ecryptfs_aops;
72
73 if (S_ISLNK(inode->i_mode))
74 inode->i_op = &ecryptfs_symlink_iops;
75 else if (S_ISDIR(inode->i_mode))
76 inode->i_op = &ecryptfs_dir_iops;
77 else
78 inode->i_op = &ecryptfs_main_iops;
79
80 if (S_ISDIR(inode->i_mode))
81 inode->i_fop = &ecryptfs_dir_fops;
82 else if (special_file(inode->i_mode))
83 init_special_inode(inode, inode->i_mode, inode->i_rdev);
84 else
85 inode->i_fop = &ecryptfs_main_fops;
86
87 return 0;
88}
89
90static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
91 struct super_block *sb)
92{
93 struct inode *inode;
94
95 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
96 return ERR_PTR(-EXDEV);
97 if (!igrab(lower_inode))
98 return ERR_PTR(-ESTALE);
99 inode = iget5_locked(sb, (unsigned long)lower_inode,
100 ecryptfs_inode_test, ecryptfs_inode_set,
101 lower_inode);
102 if (!inode) {
103 iput(lower_inode);
104 return ERR_PTR(-EACCES);
105 }
106 if (!(inode->i_state & I_NEW))
107 iput(lower_inode);
108
109 return inode;
110}
111
112struct inode *ecryptfs_get_inode(struct inode *lower_inode,
113 struct super_block *sb)
114{
115 struct inode *inode = __ecryptfs_get_inode(lower_inode, sb);
116
117 if (!IS_ERR(inode) && (inode->i_state & I_NEW))
118 unlock_new_inode(inode);
119
120 return inode;
121}
122
123/**
124 * ecryptfs_interpose
125 * @lower_dentry: Existing dentry in the lower filesystem
126 * @dentry: ecryptfs' dentry
127 * @sb: ecryptfs's super_block
128 *
129 * Interposes upper and lower dentries.
130 *
131 * Returns zero on success; non-zero otherwise
132 */
133static int ecryptfs_interpose(struct dentry *lower_dentry,
134 struct dentry *dentry, struct super_block *sb)
135{
136 struct inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb);
137
138 if (IS_ERR(inode))
139 return PTR_ERR(inode);
140 d_instantiate(dentry, inode);
141
142 return 0;
143}
144
54/** 145/**
55 * ecryptfs_create_underlying_file 146 * ecryptfs_create_underlying_file
56 * @lower_dir_inode: inode of the parent in the lower fs of the new file 147 * @lower_dir_inode: inode of the parent in the lower fs of the new file
@@ -129,7 +220,7 @@ ecryptfs_do_create(struct inode *directory_inode,
129 goto out_lock; 220 goto out_lock;
130 } 221 }
131 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, 222 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
132 directory_inode->i_sb, 0); 223 directory_inode->i_sb);
133 if (rc) { 224 if (rc) {
134 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n"); 225 ecryptfs_printk(KERN_ERR, "Failure in ecryptfs_interpose\n");
135 goto out_lock; 226 goto out_lock;
@@ -168,7 +259,8 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
168 "context; rc = [%d]\n", rc); 259 "context; rc = [%d]\n", rc);
169 goto out; 260 goto out;
170 } 261 }
171 rc = ecryptfs_get_lower_file(ecryptfs_dentry); 262 rc = ecryptfs_get_lower_file(ecryptfs_dentry,
263 ecryptfs_dentry->d_inode);
172 if (rc) { 264 if (rc) {
173 printk(KERN_ERR "%s: Error attempting to initialize " 265 printk(KERN_ERR "%s: Error attempting to initialize "
174 "the lower file for the dentry with name " 266 "the lower file for the dentry with name "
@@ -215,102 +307,90 @@ out:
215 return rc; 307 return rc;
216} 308}
217 309
310static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
311{
312 struct ecryptfs_crypt_stat *crypt_stat;
313 int rc;
314
315 rc = ecryptfs_get_lower_file(dentry, inode);
316 if (rc) {
317 printk(KERN_ERR "%s: Error attempting to initialize "
318 "the lower file for the dentry with name "
319 "[%s]; rc = [%d]\n", __func__,
320 dentry->d_name.name, rc);
321 return rc;
322 }
323
324 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
325 /* TODO: lock for crypt_stat comparison */
326 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
327 ecryptfs_set_default_sizes(crypt_stat);
328
329 rc = ecryptfs_read_and_validate_header_region(inode);
330 ecryptfs_put_lower_file(inode);
331 if (rc) {
332 rc = ecryptfs_read_and_validate_xattr_region(dentry, inode);
333 if (!rc)
334 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
335 }
336
337 /* Must return 0 to allow non-eCryptfs files to be looked up, too */
338 return 0;
339}
340
218/** 341/**
219 * ecryptfs_lookup_and_interpose_lower - Perform a lookup 342 * ecryptfs_lookup_interpose - Dentry interposition for a lookup
220 */ 343 */
221int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 344static int ecryptfs_lookup_interpose(struct dentry *dentry,
222 struct dentry *lower_dentry, 345 struct dentry *lower_dentry,
223 struct inode *ecryptfs_dir_inode) 346 struct inode *dir_inode)
224{ 347{
225 struct dentry *lower_dir_dentry; 348 struct inode *inode, *lower_inode = lower_dentry->d_inode;
349 struct ecryptfs_dentry_info *dentry_info;
226 struct vfsmount *lower_mnt; 350 struct vfsmount *lower_mnt;
227 struct inode *lower_inode; 351 int rc = 0;
228 struct ecryptfs_crypt_stat *crypt_stat; 352
229 char *page_virt = NULL; 353 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
230 int put_lower = 0, rc = 0; 354 fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode);
231
232 lower_dir_dentry = lower_dentry->d_parent;
233 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
234 ecryptfs_dentry->d_parent));
235 lower_inode = lower_dentry->d_inode;
236 fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode);
237 BUG_ON(!lower_dentry->d_count); 355 BUG_ON(!lower_dentry->d_count);
238 ecryptfs_set_dentry_private(ecryptfs_dentry, 356
239 kmem_cache_alloc(ecryptfs_dentry_info_cache, 357 dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
240 GFP_KERNEL)); 358 ecryptfs_set_dentry_private(dentry, dentry_info);
241 if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { 359 if (!dentry_info) {
242 rc = -ENOMEM;
243 printk(KERN_ERR "%s: Out of memory whilst attempting " 360 printk(KERN_ERR "%s: Out of memory whilst attempting "
244 "to allocate ecryptfs_dentry_info struct\n", 361 "to allocate ecryptfs_dentry_info struct\n",
245 __func__); 362 __func__);
246 goto out_put; 363 dput(lower_dentry);
364 mntput(lower_mnt);
365 d_drop(dentry);
366 return -ENOMEM;
247 } 367 }
248 ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); 368 ecryptfs_set_dentry_lower(dentry, lower_dentry);
249 ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); 369 ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt);
370
250 if (!lower_dentry->d_inode) { 371 if (!lower_dentry->d_inode) {
251 /* We want to add because we couldn't find in lower */ 372 /* We want to add because we couldn't find in lower */
252 d_add(ecryptfs_dentry, NULL); 373 d_add(dentry, NULL);
253 goto out; 374 return 0;
254 }
255 rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry,
256 ecryptfs_dir_inode->i_sb,
257 ECRYPTFS_INTERPOSE_FLAG_D_ADD);
258 if (rc) {
259 printk(KERN_ERR "%s: Error interposing; rc = [%d]\n",
260 __func__, rc);
261 goto out;
262 }
263 if (S_ISDIR(lower_inode->i_mode))
264 goto out;
265 if (S_ISLNK(lower_inode->i_mode))
266 goto out;
267 if (special_file(lower_inode->i_mode))
268 goto out;
269 /* Released in this function */
270 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
271 if (!page_virt) {
272 printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n",
273 __func__);
274 rc = -ENOMEM;
275 goto out;
276 } 375 }
277 rc = ecryptfs_get_lower_file(ecryptfs_dentry); 376 inode = __ecryptfs_get_inode(lower_inode, dir_inode->i_sb);
278 if (rc) { 377 if (IS_ERR(inode)) {
279 printk(KERN_ERR "%s: Error attempting to initialize " 378 printk(KERN_ERR "%s: Error interposing; rc = [%ld]\n",
280 "the lower file for the dentry with name " 379 __func__, PTR_ERR(inode));
281 "[%s]; rc = [%d]\n", __func__, 380 return PTR_ERR(inode);
282 ecryptfs_dentry->d_name.name, rc);
283 goto out_free_kmem;
284 } 381 }
285 put_lower = 1; 382 if (S_ISREG(inode->i_mode)) {
286 crypt_stat = &ecryptfs_inode_to_private( 383 rc = ecryptfs_i_size_read(dentry, inode);
287 ecryptfs_dentry->d_inode)->crypt_stat;
288 /* TODO: lock for crypt_stat comparison */
289 if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
290 ecryptfs_set_default_sizes(crypt_stat);
291 rc = ecryptfs_read_and_validate_header_region(page_virt,
292 ecryptfs_dentry->d_inode);
293 if (rc) {
294 memset(page_virt, 0, PAGE_CACHE_SIZE);
295 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
296 ecryptfs_dentry);
297 if (rc) { 384 if (rc) {
298 rc = 0; 385 make_bad_inode(inode);
299 goto out_free_kmem; 386 return rc;
300 } 387 }
301 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
302 } 388 }
303 ecryptfs_i_size_init(page_virt, ecryptfs_dentry->d_inode); 389
304out_free_kmem: 390 if (inode->i_state & I_NEW)
305 kmem_cache_free(ecryptfs_header_cache_2, page_virt); 391 unlock_new_inode(inode);
306 goto out; 392 d_add(dentry, inode);
307out_put: 393
308 dput(lower_dentry);
309 mntput(lower_mnt);
310 d_drop(ecryptfs_dentry);
311out:
312 if (put_lower)
313 ecryptfs_put_lower_file(ecryptfs_dentry->d_inode);
314 return rc; 394 return rc;
315} 395}
316 396
@@ -353,12 +433,12 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
353 goto out_d_drop; 433 goto out_d_drop;
354 } 434 }
355 if (lower_dentry->d_inode) 435 if (lower_dentry->d_inode)
356 goto lookup_and_interpose; 436 goto interpose;
357 mount_crypt_stat = &ecryptfs_superblock_to_private( 437 mount_crypt_stat = &ecryptfs_superblock_to_private(
358 ecryptfs_dentry->d_sb)->mount_crypt_stat; 438 ecryptfs_dentry->d_sb)->mount_crypt_stat;
359 if (!(mount_crypt_stat 439 if (!(mount_crypt_stat
360 && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) 440 && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
361 goto lookup_and_interpose; 441 goto interpose;
362 dput(lower_dentry); 442 dput(lower_dentry);
363 rc = ecryptfs_encrypt_and_encode_filename( 443 rc = ecryptfs_encrypt_and_encode_filename(
364 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, 444 &encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
@@ -381,9 +461,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
381 encrypted_and_encoded_name); 461 encrypted_and_encoded_name);
382 goto out_d_drop; 462 goto out_d_drop;
383 } 463 }
384lookup_and_interpose: 464interpose:
385 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 465 rc = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry,
386 ecryptfs_dir_inode); 466 ecryptfs_dir_inode);
387 goto out; 467 goto out;
388out_d_drop: 468out_d_drop:
389 d_drop(ecryptfs_dentry); 469 d_drop(ecryptfs_dentry);
@@ -411,7 +491,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
411 lower_new_dentry); 491 lower_new_dentry);
412 if (rc || !lower_new_dentry->d_inode) 492 if (rc || !lower_new_dentry->d_inode)
413 goto out_lock; 493 goto out_lock;
414 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 494 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
415 if (rc) 495 if (rc)
416 goto out_lock; 496 goto out_lock;
417 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 497 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -478,7 +558,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
478 kfree(encoded_symname); 558 kfree(encoded_symname);
479 if (rc || !lower_dentry->d_inode) 559 if (rc || !lower_dentry->d_inode)
480 goto out_lock; 560 goto out_lock;
481 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); 561 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
482 if (rc) 562 if (rc)
483 goto out_lock; 563 goto out_lock;
484 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 564 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -502,7 +582,7 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
502 rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); 582 rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode);
503 if (rc || !lower_dentry->d_inode) 583 if (rc || !lower_dentry->d_inode)
504 goto out; 584 goto out;
505 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); 585 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
506 if (rc) 586 if (rc)
507 goto out; 587 goto out;
508 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 588 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -521,8 +601,6 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
521 struct dentry *lower_dir_dentry; 601 struct dentry *lower_dir_dentry;
522 int rc; 602 int rc;
523 603
524 dentry_unhash(dentry);
525
526 lower_dentry = ecryptfs_dentry_to_lower(dentry); 604 lower_dentry = ecryptfs_dentry_to_lower(dentry);
527 dget(dentry); 605 dget(dentry);
528 lower_dir_dentry = lock_parent(lower_dentry); 606 lower_dir_dentry = lock_parent(lower_dentry);
@@ -552,7 +630,7 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
552 rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); 630 rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev);
553 if (rc || !lower_dentry->d_inode) 631 if (rc || !lower_dentry->d_inode)
554 goto out; 632 goto out;
555 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 0); 633 rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
556 if (rc) 634 if (rc)
557 goto out; 635 goto out;
558 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); 636 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
@@ -575,9 +653,6 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
575 struct dentry *lower_new_dir_dentry; 653 struct dentry *lower_new_dir_dentry;
576 struct dentry *trap = NULL; 654 struct dentry *trap = NULL;
577 655
578 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
579 dentry_unhash(new_dentry);
580
581 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); 656 lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
582 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); 657 lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
583 dget(lower_old_dentry); 658 dget(lower_old_dentry);
@@ -755,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
755 lower_ia->ia_valid &= ~ATTR_SIZE; 830 lower_ia->ia_valid &= ~ATTR_SIZE;
756 return 0; 831 return 0;
757 } 832 }
758 rc = ecryptfs_get_lower_file(dentry); 833 rc = ecryptfs_get_lower_file(dentry, inode);
759 if (rc) 834 if (rc)
760 return rc; 835 return rc;
761 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 836 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
@@ -911,7 +986,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
911 986
912 mount_crypt_stat = &ecryptfs_superblock_to_private( 987 mount_crypt_stat = &ecryptfs_superblock_to_private(
913 dentry->d_sb)->mount_crypt_stat; 988 dentry->d_sb)->mount_crypt_stat;
914 rc = ecryptfs_get_lower_file(dentry); 989 rc = ecryptfs_get_lower_file(dentry, inode);
915 if (rc) { 990 if (rc) {
916 mutex_unlock(&crypt_stat->cs_mutex); 991 mutex_unlock(&crypt_stat->cs_mutex);
917 goto out; 992 goto out;
@@ -1084,21 +1159,6 @@ out:
1084 return rc; 1159 return rc;
1085} 1160}
1086 1161
1087int ecryptfs_inode_test(struct inode *inode, void *candidate_lower_inode)
1088{
1089 if ((ecryptfs_inode_to_lower(inode)
1090 == (struct inode *)candidate_lower_inode))
1091 return 1;
1092 else
1093 return 0;
1094}
1095
1096int ecryptfs_inode_set(struct inode *inode, void *lower_inode)
1097{
1098 ecryptfs_init_inode(inode, (struct inode *)lower_inode);
1099 return 0;
1100}
1101
1102const struct inode_operations ecryptfs_symlink_iops = { 1162const struct inode_operations ecryptfs_symlink_iops = {
1103 .readlink = ecryptfs_readlink, 1163 .readlink = ecryptfs_readlink,
1104 .follow_link = ecryptfs_follow_link, 1164 .follow_link = ecryptfs_follow_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 89b93389af8e..9f1bb747d77d 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -135,12 +135,12 @@ static int ecryptfs_init_lower_file(struct dentry *dentry,
135 return rc; 135 return rc;
136} 136}
137 137
138int ecryptfs_get_lower_file(struct dentry *dentry) 138int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode)
139{ 139{
140 struct ecryptfs_inode_info *inode_info = 140 struct ecryptfs_inode_info *inode_info;
141 ecryptfs_inode_to_private(dentry->d_inode);
142 int count, rc = 0; 141 int count, rc = 0;
143 142
143 inode_info = ecryptfs_inode_to_private(inode);
144 mutex_lock(&inode_info->lower_file_mutex); 144 mutex_lock(&inode_info->lower_file_mutex);
145 count = atomic_inc_return(&inode_info->lower_file_count); 145 count = atomic_inc_return(&inode_info->lower_file_count);
146 if (WARN_ON_ONCE(count < 1)) 146 if (WARN_ON_ONCE(count < 1))
@@ -168,75 +168,6 @@ void ecryptfs_put_lower_file(struct inode *inode)
168 } 168 }
169} 169}
170 170
171static struct inode *ecryptfs_get_inode(struct inode *lower_inode,
172 struct super_block *sb)
173{
174 struct inode *inode;
175 int rc = 0;
176
177 if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb)) {
178 rc = -EXDEV;
179 goto out;
180 }
181 if (!igrab(lower_inode)) {
182 rc = -ESTALE;
183 goto out;
184 }
185 inode = iget5_locked(sb, (unsigned long)lower_inode,
186 ecryptfs_inode_test, ecryptfs_inode_set,
187 lower_inode);
188 if (!inode) {
189 rc = -EACCES;
190 iput(lower_inode);
191 goto out;
192 }
193 if (inode->i_state & I_NEW)
194 unlock_new_inode(inode);
195 else
196 iput(lower_inode);
197 if (S_ISLNK(lower_inode->i_mode))
198 inode->i_op = &ecryptfs_symlink_iops;
199 else if (S_ISDIR(lower_inode->i_mode))
200 inode->i_op = &ecryptfs_dir_iops;
201 if (S_ISDIR(lower_inode->i_mode))
202 inode->i_fop = &ecryptfs_dir_fops;
203 if (special_file(lower_inode->i_mode))
204 init_special_inode(inode, lower_inode->i_mode,
205 lower_inode->i_rdev);
206 fsstack_copy_attr_all(inode, lower_inode);
207 /* This size will be overwritten for real files w/ headers and
208 * other metadata */
209 fsstack_copy_inode_size(inode, lower_inode);
210 return inode;
211out:
212 return ERR_PTR(rc);
213}
214
215/**
216 * ecryptfs_interpose
217 * @lower_dentry: Existing dentry in the lower filesystem
218 * @dentry: ecryptfs' dentry
219 * @sb: ecryptfs's super_block
220 * @flags: flags to govern behavior of interpose procedure
221 *
222 * Interposes upper and lower dentries.
223 *
224 * Returns zero on success; non-zero otherwise
225 */
226int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
227 struct super_block *sb, u32 flags)
228{
229 struct inode *lower_inode = lower_dentry->d_inode;
230 struct inode *inode = ecryptfs_get_inode(lower_inode, sb);
231 if (IS_ERR(inode))
232 return PTR_ERR(inode);
233 if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
234 d_add(dentry, inode);
235 else
236 d_instantiate(dentry, inode);
237 return 0;
238}
239
240enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, 171enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
241 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, 172 ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher,
242 ecryptfs_opt_ecryptfs_key_bytes, 173 ecryptfs_opt_ecryptfs_key_bytes,
@@ -704,13 +635,8 @@ static struct ecryptfs_cache_info {
704 .size = sizeof(struct ecryptfs_sb_info), 635 .size = sizeof(struct ecryptfs_sb_info),
705 }, 636 },
706 { 637 {
707 .cache = &ecryptfs_header_cache_1, 638 .cache = &ecryptfs_header_cache,
708 .name = "ecryptfs_headers_1", 639 .name = "ecryptfs_headers",
709 .size = PAGE_CACHE_SIZE,
710 },
711 {
712 .cache = &ecryptfs_header_cache_2,
713 .name = "ecryptfs_headers_2",
714 .size = PAGE_CACHE_SIZE, 640 .size = PAGE_CACHE_SIZE,
715 }, 641 },
716 { 642 {
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 245b517bf1b6..dbd52d40df4c 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -93,22 +93,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
93} 93}
94 94
95/** 95/**
96 * ecryptfs_init_inode
97 * @inode: The ecryptfs inode
98 *
99 * Set up the ecryptfs inode.
100 */
101void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
102{
103 ecryptfs_set_inode_lower(inode, lower_inode);
104 inode->i_ino = lower_inode->i_ino;
105 inode->i_version++;
106 inode->i_op = &ecryptfs_main_iops;
107 inode->i_fop = &ecryptfs_main_fops;
108 inode->i_mapping->a_ops = &ecryptfs_aops;
109}
110
111/**
112 * ecryptfs_statfs 96 * ecryptfs_statfs
113 * @sb: The ecryptfs super block 97 * @sb: The ecryptfs super block
114 * @buf: The struct kstatfs to fill in with stats 98 * @buf: The struct kstatfs to fill in with stats
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 68b2e43d7c35..3451d23c3bae 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -3392,7 +3392,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3392 * so would cause a commit on atime updates, which we don't bother doing. 3392 * so would cause a commit on atime updates, which we don't bother doing.
3393 * We handle synchronous inodes at the highest possible level. 3393 * We handle synchronous inodes at the highest possible level.
3394 */ 3394 */
3395void ext3_dirty_inode(struct inode *inode) 3395void ext3_dirty_inode(struct inode *inode, int flags)
3396{ 3396{
3397 handle_t *current_handle = ext3_journal_current_handle(); 3397 handle_t *current_handle = ext3_journal_current_handle();
3398 handle_t *handle; 3398 handle_t *handle;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a74b89c09f90..1921392cd708 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1813,7 +1813,7 @@ extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1813extern void ext4_evict_inode(struct inode *); 1813extern void ext4_evict_inode(struct inode *);
1814extern void ext4_clear_inode(struct inode *); 1814extern void ext4_clear_inode(struct inode *);
1815extern int ext4_sync_inode(handle_t *, struct inode *); 1815extern int ext4_sync_inode(handle_t *, struct inode *);
1816extern void ext4_dirty_inode(struct inode *); 1816extern void ext4_dirty_inode(struct inode *, int);
1817extern int ext4_change_inode_journal_flag(struct inode *, int); 1817extern int ext4_change_inode_journal_flag(struct inode *, int);
1818extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1818extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1819extern int ext4_can_truncate(struct inode *inode); 1819extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 50d0e9c64584..a5763e3505ba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5733,7 +5733,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
5733 * so would cause a commit on atime updates, which we don't bother doing. 5733 * so would cause a commit on atime updates, which we don't bother doing.
5734 * We handle synchronous inodes at the highest possible level. 5734 * We handle synchronous inodes at the highest possible level.
5735 */ 5735 */
5736void ext4_dirty_inode(struct inode *inode) 5736void ext4_dirty_inode(struct inode *inode, int flags)
5737{ 5737{
5738 handle_t *handle; 5738 handle_t *handle;
5739 5739
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index be15437c272e..3b222dafd15b 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -326,8 +326,6 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
326 struct fat_slot_info sinfo; 326 struct fat_slot_info sinfo;
327 int err; 327 int err;
328 328
329 dentry_unhash(dentry);
330
331 lock_super(sb); 329 lock_super(sb);
332 /* 330 /*
333 * Check whether the directory is not in use, then check 331 * Check whether the directory is not in use, then check
@@ -459,9 +457,6 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
459 old_inode = old_dentry->d_inode; 457 old_inode = old_dentry->d_inode;
460 new_inode = new_dentry->d_inode; 458 new_inode = new_dentry->d_inode;
461 459
462 if (new_inode && S_ISDIR(new_inode->i_mode))
463 dentry_unhash(new_dentry);
464
465 err = fat_scan(old_dir, old_name, &old_sinfo); 460 err = fat_scan(old_dir, old_name, &old_sinfo);
466 if (err) { 461 if (err) {
467 err = -EIO; 462 err = -EIO;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index c61a6789f36c..20b4ea53fdc4 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -824,8 +824,6 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
824 struct fat_slot_info sinfo; 824 struct fat_slot_info sinfo;
825 int err; 825 int err;
826 826
827 dentry_unhash(dentry);
828
829 lock_super(sb); 827 lock_super(sb);
830 828
831 err = fat_dir_empty(inode); 829 err = fat_dir_empty(inode);
@@ -933,9 +931,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
933 int err, is_dir, update_dotdot, corrupt = 0; 931 int err, is_dir, update_dotdot, corrupt = 0;
934 struct super_block *sb = old_dir->i_sb; 932 struct super_block *sb = old_dir->i_sb;
935 933
936 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
937 dentry_unhash(new_dentry);
938
939 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; 934 old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
940 old_inode = old_dentry->d_inode; 935 old_inode = old_dentry->d_inode;
941 new_inode = new_dentry->d_inode; 936 new_inode = new_dentry->d_inode;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 34591ee804b5..0f015a0468de 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1007,9 +1007,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1007 * In short, make sure you hash any inodes _before_ you start marking 1007 * In short, make sure you hash any inodes _before_ you start marking
1008 * them dirty. 1008 * them dirty.
1009 * 1009 *
1010 * This function *must* be atomic for the I_DIRTY_PAGES case -
1011 * set_page_dirty() is called under spinlock in several places.
1012 *
1013 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of 1010 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
1014 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of 1011 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
1015 * the kernel-internal blockdev inode represents the dirtying time of the 1012 * the kernel-internal blockdev inode represents the dirtying time of the
@@ -1028,7 +1025,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1028 */ 1025 */
1029 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1026 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
1030 if (sb->s_op->dirty_inode) 1027 if (sb->s_op->dirty_inode)
1031 sb->s_op->dirty_inode(inode); 1028 sb->s_op->dirty_inode(inode, flags);
1032 } 1029 }
1033 1030
1034 /* 1031 /*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0d0e3faddcfa..d50160714595 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -667,8 +667,6 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
667 if (IS_ERR(req)) 667 if (IS_ERR(req))
668 return PTR_ERR(req); 668 return PTR_ERR(req);
669 669
670 dentry_unhash(entry);
671
672 req->in.h.opcode = FUSE_RMDIR; 670 req->in.h.opcode = FUSE_RMDIR;
673 req->in.h.nodeid = get_node_id(dir); 671 req->in.h.nodeid = get_node_id(dir);
674 req->in.numargs = 1; 672 req->in.numargs = 1;
@@ -694,9 +692,6 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
694 struct fuse_conn *fc = get_fuse_conn(olddir); 692 struct fuse_conn *fc = get_fuse_conn(olddir);
695 struct fuse_req *req = fuse_get_req(fc); 693 struct fuse_req *req = fuse_get_req(fc);
696 694
697 if (newent->d_inode && S_ISDIR(newent->d_inode->i_mode))
698 dentry_unhash(newent);
699
700 if (IS_ERR(req)) 695 if (IS_ERR(req))
701 return PTR_ERR(req); 696 return PTR_ERR(req);
702 697
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 1cb70cdba2c1..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -253,9 +253,6 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry)
253 struct inode *inode = dentry->d_inode; 253 struct inode *inode = dentry->d_inode;
254 int res; 254 int res;
255 255
256 if (S_ISDIR(inode->i_mode))
257 dentry_unhash(dentry);
258
259 if (S_ISDIR(inode->i_mode) && inode->i_size != 2) 256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
260 return -ENOTEMPTY; 257 return -ENOTEMPTY;
261 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
@@ -286,9 +283,6 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
286 283
287 /* Unlink destination if it already exists */ 284 /* Unlink destination if it already exists */
288 if (new_dentry->d_inode) { 285 if (new_dentry->d_inode) {
289 if (S_ISDIR(new_dentry->d_inode->i_mode))
290 dentry_unhash(new_dentry);
291
292 res = hfs_remove(new_dir, new_dentry); 286 res = hfs_remove(new_dir, new_dentry);
293 if (res) 287 if (res)
294 return res; 288 return res;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index b28835091dd0..4df5059c25da 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -370,8 +370,6 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
370 struct inode *inode = dentry->d_inode; 370 struct inode *inode = dentry->d_inode;
371 int res; 371 int res;
372 372
373 dentry_unhash(dentry);
374
375 if (inode->i_size != 2) 373 if (inode->i_size != 2)
376 return -ENOTEMPTY; 374 return -ENOTEMPTY;
377 375
@@ -469,12 +467,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
469 467
470 /* Unlink destination if it already exists */ 468 /* Unlink destination if it already exists */
471 if (new_dentry->d_inode) { 469 if (new_dentry->d_inode) {
472 if (S_ISDIR(new_dentry->d_inode->i_mode)) { 470 if (S_ISDIR(new_dentry->d_inode->i_mode))
473 dentry_unhash(new_dentry);
474 res = hfsplus_rmdir(new_dir, new_dentry); 471 res = hfsplus_rmdir(new_dir, new_dentry);
475 } else { 472 else
476 res = hfsplus_unlink(new_dir, new_dentry); 473 res = hfsplus_unlink(new_dir, new_dentry);
477 }
478 if (res) 474 if (res)
479 return res; 475 return res;
480 } 476 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6816b9e6903..2638c834ed28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -683,8 +683,6 @@ int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
683 char *file; 683 char *file;
684 int err; 684 int err;
685 685
686 dentry_unhash(dentry);
687
688 if ((file = dentry_name(dentry)) == NULL) 686 if ((file = dentry_name(dentry)) == NULL)
689 return -ENOMEM; 687 return -ENOMEM;
690 err = do_rmdir(file); 688 err = do_rmdir(file);
@@ -738,9 +736,6 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
738 char *from_name, *to_name; 736 char *from_name, *to_name;
739 int err; 737 int err;
740 738
741 if (to->d_inode && S_ISDIR(to->d_inode->i_mode))
742 dentry_unhash(to);
743
744 if ((from_name = dentry_name(from)) == NULL) 739 if ((from_name = dentry_name(from)) == NULL)
745 return -ENOMEM; 740 return -ENOMEM;
746 if ((to_name = dentry_name(to)) == NULL) { 741 if ((to_name = dentry_name(to)) == NULL) {
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index ff0ce21c0867..acf95dab2aac 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -439,8 +439,6 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
439 int err; 439 int err;
440 int r; 440 int r;
441 441
442 dentry_unhash(dentry);
443
444 hpfs_adjust_length(name, &len); 442 hpfs_adjust_length(name, &len);
445 hpfs_lock(dir->i_sb); 443 hpfs_lock(dir->i_sb);
446 err = -ENOENT; 444 err = -ENOENT;
@@ -535,9 +533,6 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
535 struct fnode *fnode; 533 struct fnode *fnode;
536 int err; 534 int err;
537 535
538 if (new_inode && S_ISDIR(new_inode->i_mode))
539 dentry_unhash(new_dentry);
540
541 if ((err = hpfs_chk_name(new_name, &new_len))) return err; 536 if ((err = hpfs_chk_name(new_name, &new_len))) return err;
542 err = 0; 537 err = 0;
543 hpfs_adjust_length(old_name, &old_len); 538 hpfs_adjust_length(old_name, &old_len);
diff --git a/fs/inode.c b/fs/inode.c
index 990d284877a1..0f7e88a7803f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1,9 +1,7 @@
1/* 1/*
2 * linux/fs/inode.c
3 *
4 * (C) 1997 Linus Torvalds 2 * (C) 1997 Linus Torvalds
3 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
5 */ 4 */
6
7#include <linux/fs.h> 5#include <linux/fs.h>
8#include <linux/mm.h> 6#include <linux/mm.h>
9#include <linux/dcache.h> 7#include <linux/dcache.h>
@@ -27,10 +25,11 @@
27#include <linux/prefetch.h> 25#include <linux/prefetch.h>
28#include <linux/ima.h> 26#include <linux/ima.h>
29#include <linux/cred.h> 27#include <linux/cred.h>
28#include <linux/buffer_head.h> /* for inode_has_buffers */
30#include "internal.h" 29#include "internal.h"
31 30
32/* 31/*
33 * inode locking rules. 32 * Inode locking rules:
34 * 33 *
35 * inode->i_lock protects: 34 * inode->i_lock protects:
36 * inode->i_state, inode->i_hash, __iget() 35 * inode->i_state, inode->i_hash, __iget()
@@ -60,54 +59,11 @@
60 * inode_hash_lock 59 * inode_hash_lock
61 */ 60 */
62 61
63/*
64 * This is needed for the following functions:
65 * - inode_has_buffers
66 * - invalidate_bdev
67 *
68 * FIXME: remove all knowledge of the buffer layer from this file
69 */
70#include <linux/buffer_head.h>
71
72/*
73 * New inode.c implementation.
74 *
75 * This implementation has the basic premise of trying
76 * to be extremely low-overhead and SMP-safe, yet be
77 * simple enough to be "obviously correct".
78 *
79 * Famous last words.
80 */
81
82/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */
83
84/* #define INODE_PARANOIA 1 */
85/* #define INODE_DEBUG 1 */
86
87/*
88 * Inode lookup is no longer as critical as it used to be:
89 * most of the lookups are going to be through the dcache.
90 */
91#define I_HASHBITS i_hash_shift
92#define I_HASHMASK i_hash_mask
93
94static unsigned int i_hash_mask __read_mostly; 62static unsigned int i_hash_mask __read_mostly;
95static unsigned int i_hash_shift __read_mostly; 63static unsigned int i_hash_shift __read_mostly;
96static struct hlist_head *inode_hashtable __read_mostly; 64static struct hlist_head *inode_hashtable __read_mostly;
97static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); 65static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
98 66
99/*
100 * Each inode can be on two separate lists. One is
101 * the hash list of the inode, used for lookups. The
102 * other linked list is the "type" list:
103 * "in_use" - valid inode, i_count > 0, i_nlink > 0
104 * "dirty" - as "in_use" but also dirty
105 * "unused" - valid inode, i_count = 0
106 *
107 * A "dirty" list is maintained for each super block,
108 * allowing for low-overhead inode sync() operations.
109 */
110
111static LIST_HEAD(inode_lru); 67static LIST_HEAD(inode_lru);
112static DEFINE_SPINLOCK(inode_lru_lock); 68static DEFINE_SPINLOCK(inode_lru_lock);
113 69
@@ -424,8 +380,8 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
424 380
425 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / 381 tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
426 L1_CACHE_BYTES; 382 L1_CACHE_BYTES;
427 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS); 383 tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
428 return tmp & I_HASHMASK; 384 return tmp & i_hash_mask;
429} 385}
430 386
431/** 387/**
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 05f73328b28b..4bca6a2e5c07 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -75,7 +75,6 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
75 struct nameidata *nd) 75 struct nameidata *nd)
76{ 76{
77 struct jffs2_inode_info *dir_f; 77 struct jffs2_inode_info *dir_f;
78 struct jffs2_sb_info *c;
79 struct jffs2_full_dirent *fd = NULL, *fd_list; 78 struct jffs2_full_dirent *fd = NULL, *fd_list;
80 uint32_t ino = 0; 79 uint32_t ino = 0;
81 struct inode *inode = NULL; 80 struct inode *inode = NULL;
@@ -86,7 +85,6 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
86 return ERR_PTR(-ENAMETOOLONG); 85 return ERR_PTR(-ENAMETOOLONG);
87 86
88 dir_f = JFFS2_INODE_INFO(dir_i); 87 dir_f = JFFS2_INODE_INFO(dir_i);
89 c = JFFS2_SB_INFO(dir_i->i_sb);
90 88
91 mutex_lock(&dir_f->sem); 89 mutex_lock(&dir_f->sem);
92 90
@@ -119,7 +117,6 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
119static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) 117static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
120{ 118{
121 struct jffs2_inode_info *f; 119 struct jffs2_inode_info *f;
122 struct jffs2_sb_info *c;
123 struct inode *inode = filp->f_path.dentry->d_inode; 120 struct inode *inode = filp->f_path.dentry->d_inode;
124 struct jffs2_full_dirent *fd; 121 struct jffs2_full_dirent *fd;
125 unsigned long offset, curofs; 122 unsigned long offset, curofs;
@@ -127,7 +124,6 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir)
127 D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino)); 124 D1(printk(KERN_DEBUG "jffs2_readdir() for dir_i #%lu\n", filp->f_path.dentry->d_inode->i_ino));
128 125
129 f = JFFS2_INODE_INFO(inode); 126 f = JFFS2_INODE_INFO(inode);
130 c = JFFS2_SB_INFO(inode->i_sb);
131 127
132 offset = filp->f_pos; 128 offset = filp->f_pos;
133 129
@@ -609,8 +605,6 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
609 int ret; 605 int ret;
610 uint32_t now = get_seconds(); 606 uint32_t now = get_seconds();
611 607
612 dentry_unhash(dentry);
613
614 for (fd = f->dents ; fd; fd = fd->next) { 608 for (fd = f->dents ; fd; fd = fd->next) {
615 if (fd->ino) 609 if (fd->ino)
616 return -ENOTEMPTY; 610 return -ENOTEMPTY;
@@ -786,9 +780,6 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
786 uint8_t type; 780 uint8_t type;
787 uint32_t now; 781 uint32_t now;
788 782
789 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
790 dentry_unhash(new_dentry);
791
792 /* The VFS will check for us and prevent trying to rename a 783 /* The VFS will check for us and prevent trying to rename a
793 * file over a directory and vice versa, but if it's a directory, 784 * file over a directory and vice versa, but if it's a directory,
794 * the VFS can't check whether the victim is empty. The filesystem 785 * the VFS can't check whether the victim is empty. The filesystem
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index e896e67767eb..46ad619b6124 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -357,7 +357,7 @@ error:
357 return ERR_PTR(ret); 357 return ERR_PTR(ret);
358} 358}
359 359
360void jffs2_dirty_inode(struct inode *inode) 360void jffs2_dirty_inode(struct inode *inode, int flags)
361{ 361{
362 struct iattr iattr; 362 struct iattr iattr;
363 363
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 00bae7cc2e48..65c6c43ca482 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -172,7 +172,7 @@ int jffs2_setattr (struct dentry *, struct iattr *);
172int jffs2_do_setattr (struct inode *, struct iattr *); 172int jffs2_do_setattr (struct inode *, struct iattr *);
173struct inode *jffs2_iget(struct super_block *, unsigned long); 173struct inode *jffs2_iget(struct super_block *, unsigned long);
174void jffs2_evict_inode (struct inode *); 174void jffs2_evict_inode (struct inode *);
175void jffs2_dirty_inode(struct inode *inode); 175void jffs2_dirty_inode(struct inode *inode, int flags);
176struct inode *jffs2_new_inode (struct inode *dir_i, int mode, 176struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
177 struct jffs2_raw_inode *ri); 177 struct jffs2_raw_inode *ri);
178int jffs2_statfs (struct dentry *, struct kstatfs *); 178int jffs2_statfs (struct dentry *, struct kstatfs *);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index b632dddcb482..8d8cd3419d02 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -94,7 +94,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
94 uint32_t buf_size = 0; 94 uint32_t buf_size = 0;
95 struct jffs2_summary *s = NULL; /* summary info collected by the scan process */ 95 struct jffs2_summary *s = NULL; /* summary info collected by the scan process */
96#ifndef __ECOS 96#ifndef __ECOS
97 size_t pointlen; 97 size_t pointlen, try_size;
98 98
99 if (c->mtd->point) { 99 if (c->mtd->point) {
100 ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen, 100 ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen,
@@ -113,18 +113,21 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
113 /* For NAND it's quicker to read a whole eraseblock at a time, 113 /* For NAND it's quicker to read a whole eraseblock at a time,
114 apparently */ 114 apparently */
115 if (jffs2_cleanmarker_oob(c)) 115 if (jffs2_cleanmarker_oob(c))
116 buf_size = c->sector_size; 116 try_size = c->sector_size;
117 else 117 else
118 buf_size = PAGE_SIZE; 118 try_size = PAGE_SIZE;
119 119
120 /* Respect kmalloc limitations */ 120 D1(printk(KERN_DEBUG "Trying to allocate readbuf of %zu "
121 if (buf_size > 128*1024) 121 "bytes\n", try_size));
122 buf_size = 128*1024;
123 122
124 D1(printk(KERN_DEBUG "Allocating readbuf of %d bytes\n", buf_size)); 123 flashbuf = mtd_kmalloc_up_to(c->mtd, &try_size);
125 flashbuf = kmalloc(buf_size, GFP_KERNEL);
126 if (!flashbuf) 124 if (!flashbuf)
127 return -ENOMEM; 125 return -ENOMEM;
126
127 D1(printk(KERN_DEBUG "Allocated readbuf of %zu bytes\n",
128 try_size));
129
130 buf_size = (uint32_t)try_size;
128 } 131 }
129 132
130 if (jffs2_sum_active()) { 133 if (jffs2_sum_active()) {
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index eddbb373209e..109655904bbc 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -173,7 +173,7 @@ void jfs_evict_inode(struct inode *inode)
173 dquot_drop(inode); 173 dquot_drop(inode);
174} 174}
175 175
176void jfs_dirty_inode(struct inode *inode) 176void jfs_dirty_inode(struct inode *inode, int flags)
177{ 177{
178 static int noisy = 5; 178 static int noisy = 5;
179 179
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 155e91eff07d..ec2fb8b945fc 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -28,7 +28,7 @@ extern struct inode *jfs_iget(struct super_block *, unsigned long);
28extern int jfs_commit_inode(struct inode *, int); 28extern int jfs_commit_inode(struct inode *, int);
29extern int jfs_write_inode(struct inode *, struct writeback_control *); 29extern int jfs_write_inode(struct inode *, struct writeback_control *);
30extern void jfs_evict_inode(struct inode *); 30extern void jfs_evict_inode(struct inode *);
31extern void jfs_dirty_inode(struct inode *); 31extern void jfs_dirty_inode(struct inode *, int);
32extern void jfs_truncate(struct inode *); 32extern void jfs_truncate(struct inode *);
33extern void jfs_truncate_nolock(struct inode *, loff_t); 33extern void jfs_truncate_nolock(struct inode *, loff_t);
34extern void jfs_free_zero_link(struct inode *); 34extern void jfs_free_zero_link(struct inode *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 865df16a6cf3..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -360,8 +360,6 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
360 360
361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name); 361 jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
362 362
363 dentry_unhash(dentry);
364
365 /* Init inode for quota operations. */ 363 /* Init inode for quota operations. */
366 dquot_initialize(dip); 364 dquot_initialize(dip);
367 dquot_initialize(ip); 365 dquot_initialize(ip);
@@ -1097,9 +1095,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1097 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name, 1095 jfs_info("jfs_rename: %s %s", old_dentry->d_name.name,
1098 new_dentry->d_name.name); 1096 new_dentry->d_name.name);
1099 1097
1100 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1101 dentry_unhash(new_dentry);
1102
1103 dquot_initialize(old_dir); 1098 dquot_initialize(old_dir);
1104 dquot_initialize(new_dir); 1099 dquot_initialize(new_dir);
1105 1100
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f34c9cde9e94..9ed89d1663f8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -273,8 +273,6 @@ static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{ 273{
274 struct inode *inode = dentry->d_inode; 274 struct inode *inode = dentry->d_inode;
275 275
276 dentry_unhash(dentry);
277
278 if (!logfs_empty_dir(inode)) 276 if (!logfs_empty_dir(inode))
279 return -ENOTEMPTY; 277 return -ENOTEMPTY;
280 278
@@ -624,9 +622,6 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
624 loff_t pos; 622 loff_t pos;
625 int err; 623 int err;
626 624
627 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
628 dentry_unhash(new_dentry);
629
630 /* 1. locate source dd */ 625 /* 1. locate source dd */
631 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); 626 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
632 if (err) 627 if (err)
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index f60aed8db9c4..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -168,8 +168,6 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry)
168 struct inode * inode = dentry->d_inode; 168 struct inode * inode = dentry->d_inode;
169 int err = -ENOTEMPTY; 169 int err = -ENOTEMPTY;
170 170
171 dentry_unhash(dentry);
172
173 if (minix_empty_dir(inode)) { 171 if (minix_empty_dir(inode)) {
174 err = minix_unlink(dir, dentry); 172 err = minix_unlink(dir, dentry);
175 if (!err) { 173 if (!err) {
@@ -192,9 +190,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
192 struct minix_dir_entry * old_de; 190 struct minix_dir_entry * old_de;
193 int err = -ENOENT; 191 int err = -ENOENT;
194 192
195 if (new_inode && S_ISDIR(new_inode->i_mode))
196 dentry_unhash(new_dentry);
197
198 old_de = minix_find_entry(old_dentry, &old_page); 193 old_de = minix_find_entry(old_dentry, &old_page);
199 if (!old_de) 194 if (!old_de)
200 goto out; 195 goto out;
diff --git a/fs/namei.c b/fs/namei.c
index 2358b326b221..e2e4e8d032ee 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -919,12 +919,11 @@ static inline bool managed_dentry_might_block(struct dentry *dentry)
919} 919}
920 920
921/* 921/*
922 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 922 * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
923 * meet a managed dentry and we're not walking to "..". True is returned to 923 * we meet a managed dentry that would need blocking.
924 * continue, false to abort.
925 */ 924 */
926static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 925static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
927 struct inode **inode, bool reverse_transit) 926 struct inode **inode)
928{ 927{
929 for (;;) { 928 for (;;) {
930 struct vfsmount *mounted; 929 struct vfsmount *mounted;
@@ -933,8 +932,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
933 * that wants to block transit. 932 * that wants to block transit.
934 */ 933 */
935 *inode = path->dentry->d_inode; 934 *inode = path->dentry->d_inode;
936 if (!reverse_transit && 935 if (unlikely(managed_dentry_might_block(path->dentry)))
937 unlikely(managed_dentry_might_block(path->dentry)))
938 return false; 936 return false;
939 937
940 if (!d_mountpoint(path->dentry)) 938 if (!d_mountpoint(path->dentry))
@@ -947,16 +945,24 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
947 path->dentry = mounted->mnt_root; 945 path->dentry = mounted->mnt_root;
948 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 946 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
949 } 947 }
950
951 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
952 return reverse_transit;
953 return true; 948 return true;
954} 949}
955 950
956static int follow_dotdot_rcu(struct nameidata *nd) 951static void follow_mount_rcu(struct nameidata *nd)
957{ 952{
958 struct inode *inode = nd->inode; 953 while (d_mountpoint(nd->path.dentry)) {
954 struct vfsmount *mounted;
955 mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
956 if (!mounted)
957 break;
958 nd->path.mnt = mounted;
959 nd->path.dentry = mounted->mnt_root;
960 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
961 }
962}
959 963
964static int follow_dotdot_rcu(struct nameidata *nd)
965{
960 set_root_rcu(nd); 966 set_root_rcu(nd);
961 967
962 while (1) { 968 while (1) {
@@ -972,7 +978,6 @@ static int follow_dotdot_rcu(struct nameidata *nd)
972 seq = read_seqcount_begin(&parent->d_seq); 978 seq = read_seqcount_begin(&parent->d_seq);
973 if (read_seqcount_retry(&old->d_seq, nd->seq)) 979 if (read_seqcount_retry(&old->d_seq, nd->seq))
974 goto failed; 980 goto failed;
975 inode = parent->d_inode;
976 nd->path.dentry = parent; 981 nd->path.dentry = parent;
977 nd->seq = seq; 982 nd->seq = seq;
978 break; 983 break;
@@ -980,10 +985,9 @@ static int follow_dotdot_rcu(struct nameidata *nd)
980 if (!follow_up_rcu(&nd->path)) 985 if (!follow_up_rcu(&nd->path))
981 break; 986 break;
982 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 987 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
983 inode = nd->path.dentry->d_inode;
984 } 988 }
985 __follow_mount_rcu(nd, &nd->path, &inode, true); 989 follow_mount_rcu(nd);
986 nd->inode = inode; 990 nd->inode = nd->path.dentry->d_inode;
987 return 0; 991 return 0;
988 992
989failed: 993failed:
@@ -1157,8 +1161,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1157 } 1161 }
1158 path->mnt = mnt; 1162 path->mnt = mnt;
1159 path->dentry = dentry; 1163 path->dentry = dentry;
1160 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1164 if (unlikely(!__follow_mount_rcu(nd, path, inode)))
1161 return 0; 1165 goto unlazy;
1166 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1167 goto unlazy;
1168 return 0;
1162unlazy: 1169unlazy:
1163 if (unlazy_walk(nd, dentry)) 1170 if (unlazy_walk(nd, dentry))
1164 return -ECHILD; 1171 return -ECHILD;
@@ -2572,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2572 if (error) 2579 if (error)
2573 goto out; 2580 goto out;
2574 2581
2582 shrink_dcache_parent(dentry);
2575 error = dir->i_op->rmdir(dir, dentry); 2583 error = dir->i_op->rmdir(dir, dentry);
2576 if (error) 2584 if (error)
2577 goto out; 2585 goto out;
@@ -2986,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2986 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) 2994 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
2987 goto out; 2995 goto out;
2988 2996
2997 if (target)
2998 shrink_dcache_parent(new_dentry);
2989 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2999 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2990 if (error) 3000 if (error)
2991 goto out; 3001 goto out;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e3e646b06404..9c51f621e901 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -1033,8 +1033,11 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
1033 DPRINTK("ncp_rmdir: removing %s/%s\n", 1033 DPRINTK("ncp_rmdir: removing %s/%s\n",
1034 dentry->d_parent->d_name.name, dentry->d_name.name); 1034 dentry->d_parent->d_name.name, dentry->d_name.name);
1035 1035
1036 /*
1037 * fail with EBUSY if there are still references to this
1038 * directory.
1039 */
1036 dentry_unhash(dentry); 1040 dentry_unhash(dentry);
1037
1038 error = -EBUSY; 1041 error = -EBUSY;
1039 if (!d_unhashed(dentry)) 1042 if (!d_unhashed(dentry))
1040 goto out; 1043 goto out;
@@ -1141,8 +1144,16 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
1141 old_dentry->d_parent->d_name.name, old_dentry->d_name.name, 1144 old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
1142 new_dentry->d_parent->d_name.name, new_dentry->d_name.name); 1145 new_dentry->d_parent->d_name.name, new_dentry->d_name.name);
1143 1146
1144 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) 1147 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode)) {
1148 /*
1149 * fail with EBUSY if there are still references to this
1150 * directory.
1151 */
1145 dentry_unhash(new_dentry); 1152 dentry_unhash(new_dentry);
1153 error = -EBUSY;
1154 if (!d_unhashed(new_dentry))
1155 goto out;
1156 }
1146 1157
1147 ncp_age_dentry(server, old_dentry); 1158 ncp_age_dentry(server, old_dentry);
1148 ncp_age_dentry(server, new_dentry); 1159 ncp_age_dentry(server, new_dentry);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index ba306658a6db..81515545ba75 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -87,6 +87,16 @@ config NFS_V4_1
87config PNFS_FILE_LAYOUT 87config PNFS_FILE_LAYOUT
88 tristate 88 tristate
89 89
90config PNFS_OBJLAYOUT
91 tristate "Provide support for the pNFS Objects Layout Driver for NFSv4.1 pNFS (EXPERIMENTAL)"
92 depends on NFS_FS && NFS_V4_1 && SCSI_OSD_ULD
93 help
94 Say M here if you want your pNFS client to support the Objects Layout Driver.
95 Requires the SCSI osd initiator library (SCSI_OSD_INITIATOR) and
96 upper level driver (SCSI_OSD_ULD).
97
98 If unsure, say N.
99
90config ROOT_NFS 100config ROOT_NFS
91 bool "Root file system on NFS" 101 bool "Root file system on NFS"
92 depends on NFS_FS=y && IP_PNP 102 depends on NFS_FS=y && IP_PNP
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4776ff9e3814..6a34f7dd0e6f 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -15,9 +15,11 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
15 delegation.o idmap.o \ 15 delegation.o idmap.o \
16 callback.o callback_xdr.o callback_proc.o \ 16 callback.o callback_xdr.o callback_proc.o \
17 nfs4namespace.o 17 nfs4namespace.o
18nfs-$(CONFIG_NFS_V4_1) += pnfs.o 18nfs-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
19nfs-$(CONFIG_SYSCTL) += sysctl.o 19nfs-$(CONFIG_SYSCTL) += sysctl.o
20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o 20nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o
21 21
22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o 22obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o 23nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
24
25obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 46d93ce7311b..b257383bb565 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -167,6 +167,23 @@ extern unsigned nfs4_callback_layoutrecall(
167 167
168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses); 168extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
169extern void nfs4_cb_take_slot(struct nfs_client *clp); 169extern void nfs4_cb_take_slot(struct nfs_client *clp);
170
/*
 * One decoded entry of a CB_NOTIFY_DEVICEID callback, filled in by
 * decode_devicenotify_args() and consumed by nfs4_callback_devicenotify().
 */
171struct cb_devicenotifyitem {
172 uint32_t cbd_notify_type; /* NOTIFY_DEVICEID4_CHANGE or NOTIFY_DEVICEID4_DELETE (decoder rejects anything else) */
173 uint32_t cbd_layout_type; /* matched against server->pnfs_curr_ld->id to pick the layout driver */
174 struct nfs4_deviceid cbd_dev_id; /* NFS4_DEVICEID4_SIZE bytes copied from the wire */
175 uint32_t cbd_immediate; /* decoded from the wire for change notifications, otherwise 0 */
176};
177
/*
 * Decoded argument list of a CB_NOTIFY_DEVICEID callback.
 */
178struct cb_devicenotifyargs {
179 int ndevs; /* number of entries in devs[] that were decoded */
180 struct cb_devicenotifyitem *devs; /* kmalloc()ed by the decoder; kfree()d by nfs4_callback_devicenotify() */
181};
182
/*
 * Handler registered as .process_op for OP_CB_NOTIFY_DEVICEID.  Takes the
 * decoded argument list and the per-compound processing state; the "dummy"
 * parameter is not referenced by the definition.
 */
183extern __be32 nfs4_callback_devicenotify(
184 struct cb_devicenotifyargs *args,
185 void *dummy, struct cb_process_state *cps);
186
170#endif /* CONFIG_NFS_V4_1 */ 187#endif /* CONFIG_NFS_V4_1 */
171extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *); 188extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
172extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args, 189extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 2f41dccea18e..d4d1954e9bb9 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -139,7 +139,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
139 spin_lock(&ino->i_lock); 139 spin_lock(&ino->i_lock);
140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 140 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
141 mark_matching_lsegs_invalid(lo, &free_me_list, 141 mark_matching_lsegs_invalid(lo, &free_me_list,
142 args->cbl_range.iomode)) 142 &args->cbl_range))
143 rv = NFS4ERR_DELAY; 143 rv = NFS4ERR_DELAY;
144 else 144 else
145 rv = NFS4ERR_NOMATCHING_LAYOUT; 145 rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -184,7 +184,7 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
184 ino = lo->plh_inode; 184 ino = lo->plh_inode;
185 spin_lock(&ino->i_lock); 185 spin_lock(&ino->i_lock);
186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); 186 set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
187 if (mark_matching_lsegs_invalid(lo, &free_me_list, range.iomode)) 187 if (mark_matching_lsegs_invalid(lo, &free_me_list, &range))
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
@@ -241,6 +241,53 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
241 do_callback_layoutrecall(clp, &args); 241 do_callback_layoutrecall(clp, &args);
242} 242}
243 243
244__be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
245 void *dummy, struct cb_process_state *cps)
246{
247 int i;
248 __be32 res = 0;
249 struct nfs_client *clp = cps->clp;
250 struct nfs_server *server = NULL;
251
252 dprintk("%s: -->\n", __func__);
253
254 if (!clp) {
255 res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
256 goto out;
257 }
258
259 for (i = 0; i < args->ndevs; i++) {
260 struct cb_devicenotifyitem *dev = &args->devs[i];
261
262 if (!server ||
263 server->pnfs_curr_ld->id != dev->cbd_layout_type) {
264 rcu_read_lock();
265 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
266 if (server->pnfs_curr_ld &&
267 server->pnfs_curr_ld->id == dev->cbd_layout_type) {
268 rcu_read_unlock();
269 goto found;
270 }
271 rcu_read_unlock();
272 dprintk("%s: layout type %u not found\n",
273 __func__, dev->cbd_layout_type);
274 continue;
275 }
276
277 found:
278 if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
279 dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
280 "deleting instead\n", __func__);
281 nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
282 }
283
284out:
285 kfree(args->devs);
286 dprintk("%s: exit with status = %u\n",
287 __func__, be32_to_cpu(res));
288 return res;
289}
290
244int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid) 291int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation, const nfs4_stateid *stateid)
245{ 292{
246 if (delegation == NULL) 293 if (delegation == NULL)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 00ecf62ce7c1..c6c86a77e043 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -25,6 +25,7 @@
25 25
26#if defined(CONFIG_NFS_V4_1) 26#if defined(CONFIG_NFS_V4_1)
27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 27#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
28#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \ 29#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
29 4 + 1 + 3) 30 4 + 1 + 3)
30#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ) 31#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
@@ -284,6 +285,93 @@ out:
284 return status; 285 return status;
285} 286}
286 287
288static
289__be32 decode_devicenotify_args(struct svc_rqst *rqstp,
290 struct xdr_stream *xdr,
291 struct cb_devicenotifyargs *args)
292{
293 __be32 *p;
294 __be32 status = 0;
295 u32 tmp;
296 int n, i;
297 args->ndevs = 0;
298
299 /* Num of device notifications */
300 p = read_buf(xdr, sizeof(uint32_t));
301 if (unlikely(p == NULL)) {
302 status = htonl(NFS4ERR_BADXDR);
303 goto out;
304 }
305 n = ntohl(*p++);
306 if (n <= 0)
307 goto out;
308
309 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
310 if (!args->devs) {
311 status = htonl(NFS4ERR_DELAY);
312 goto out;
313 }
314
315 /* Decode each dev notification */
316 for (i = 0; i < n; i++) {
317 struct cb_devicenotifyitem *dev = &args->devs[i];
318
319 p = read_buf(xdr, (4 * sizeof(uint32_t)) + NFS4_DEVICEID4_SIZE);
320 if (unlikely(p == NULL)) {
321 status = htonl(NFS4ERR_BADXDR);
322 goto err;
323 }
324
325 tmp = ntohl(*p++); /* bitmap size */
326 if (tmp != 1) {
327 status = htonl(NFS4ERR_INVAL);
328 goto err;
329 }
330 dev->cbd_notify_type = ntohl(*p++);
331 if (dev->cbd_notify_type != NOTIFY_DEVICEID4_CHANGE &&
332 dev->cbd_notify_type != NOTIFY_DEVICEID4_DELETE) {
333 status = htonl(NFS4ERR_INVAL);
334 goto err;
335 }
336
337 tmp = ntohl(*p++); /* opaque size */
338 if (((dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) &&
339 (tmp != NFS4_DEVICEID4_SIZE + 8)) ||
340 ((dev->cbd_notify_type == NOTIFY_DEVICEID4_DELETE) &&
341 (tmp != NFS4_DEVICEID4_SIZE + 4))) {
342 status = htonl(NFS4ERR_INVAL);
343 goto err;
344 }
345 dev->cbd_layout_type = ntohl(*p++);
346 memcpy(dev->cbd_dev_id.data, p, NFS4_DEVICEID4_SIZE);
347 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
348
349 if (dev->cbd_layout_type == NOTIFY_DEVICEID4_CHANGE) {
350 p = read_buf(xdr, sizeof(uint32_t));
351 if (unlikely(p == NULL)) {
352 status = htonl(NFS4ERR_BADXDR);
353 goto err;
354 }
355 dev->cbd_immediate = ntohl(*p++);
356 } else {
357 dev->cbd_immediate = 0;
358 }
359
360 args->ndevs++;
361
362 dprintk("%s: type %d layout 0x%x immediate %d\n",
363 __func__, dev->cbd_notify_type, dev->cbd_layout_type,
364 dev->cbd_immediate);
365 }
366out:
367 dprintk("%s: status %d ndevs %d\n",
368 __func__, ntohl(status), args->ndevs);
369 return status;
370err:
371 kfree(args->devs);
372 goto out;
373}
374
287static __be32 decode_sessionid(struct xdr_stream *xdr, 375static __be32 decode_sessionid(struct xdr_stream *xdr,
288 struct nfs4_sessionid *sid) 376 struct nfs4_sessionid *sid)
289{ 377{
@@ -639,10 +727,10 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
639 case OP_CB_RECALL_ANY: 727 case OP_CB_RECALL_ANY:
640 case OP_CB_RECALL_SLOT: 728 case OP_CB_RECALL_SLOT:
641 case OP_CB_LAYOUTRECALL: 729 case OP_CB_LAYOUTRECALL:
730 case OP_CB_NOTIFY_DEVICEID:
642 *op = &callback_ops[op_nr]; 731 *op = &callback_ops[op_nr];
643 break; 732 break;
644 733
645 case OP_CB_NOTIFY_DEVICEID:
646 case OP_CB_NOTIFY: 734 case OP_CB_NOTIFY:
647 case OP_CB_PUSH_DELEG: 735 case OP_CB_PUSH_DELEG:
648 case OP_CB_RECALLABLE_OBJ_AVAIL: 736 case OP_CB_RECALLABLE_OBJ_AVAIL:
@@ -849,6 +937,12 @@ static struct callback_op callback_ops[] = {
849 (callback_decode_arg_t)decode_layoutrecall_args, 937 (callback_decode_arg_t)decode_layoutrecall_args,
850 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ, 938 .res_maxsize = CB_OP_LAYOUTRECALL_RES_MAXSZ,
851 }, 939 },
940 [OP_CB_NOTIFY_DEVICEID] = {
941 .process_op = (callback_process_op_t)nfs4_callback_devicenotify,
942 .decode_args =
943 (callback_decode_arg_t)decode_devicenotify_args,
944 .res_maxsize = CB_OP_DEVICENOTIFY_RES_MAXSZ,
945 },
852 [OP_CB_SEQUENCE] = { 946 [OP_CB_SEQUENCE] = {
853 .process_op = (callback_process_op_t)nfs4_callback_sequence, 947 .process_op = (callback_process_op_t)nfs4_callback_sequence,
854 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args, 948 .decode_args = (callback_decode_arg_t)decode_cb_sequence_args,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 139be9647d80..b3dc2b88b65b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -290,6 +290,8 @@ static void nfs_free_client(struct nfs_client *clp)
290 if (clp->cl_machine_cred != NULL) 290 if (clp->cl_machine_cred != NULL)
291 put_rpccred(clp->cl_machine_cred); 291 put_rpccred(clp->cl_machine_cred);
292 292
293 nfs4_deviceid_purge_client(clp);
294
293 kfree(clp->cl_hostname); 295 kfree(clp->cl_hostname);
294 kfree(clp); 296 kfree(clp);
295 297
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index bbbc6bf5cb2e..dd25c2aec375 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -21,25 +21,13 @@
21#include "delegation.h" 21#include "delegation.h"
22#include "internal.h" 22#include "internal.h"
23 23
24static void nfs_do_free_delegation(struct nfs_delegation *delegation)
25{
26 kfree(delegation);
27}
28
29static void nfs_free_delegation_callback(struct rcu_head *head)
30{
31 struct nfs_delegation *delegation = container_of(head, struct nfs_delegation, rcu);
32
33 nfs_do_free_delegation(delegation);
34}
35
36static void nfs_free_delegation(struct nfs_delegation *delegation) 24static void nfs_free_delegation(struct nfs_delegation *delegation)
37{ 25{
38 if (delegation->cred) { 26 if (delegation->cred) {
39 put_rpccred(delegation->cred); 27 put_rpccred(delegation->cred);
40 delegation->cred = NULL; 28 delegation->cred = NULL;
41 } 29 }
42 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 30 kfree_rcu(delegation, rcu);
43} 31}
44 32
45/** 33/**
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 424e47773a84..ededdbd0db38 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -512,12 +512,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
512 struct page **xdr_pages, struct page *page, unsigned int buflen) 512 struct page **xdr_pages, struct page *page, unsigned int buflen)
513{ 513{
514 struct xdr_stream stream; 514 struct xdr_stream stream;
515 struct xdr_buf buf = { 515 struct xdr_buf buf;
516 .pages = xdr_pages,
517 .page_len = buflen,
518 .buflen = buflen,
519 .len = buflen,
520 };
521 struct page *scratch; 516 struct page *scratch;
522 struct nfs_cache_array *array; 517 struct nfs_cache_array *array;
523 unsigned int count = 0; 518 unsigned int count = 0;
@@ -527,7 +522,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
527 if (scratch == NULL) 522 if (scratch == NULL)
528 return -ENOMEM; 523 return -ENOMEM;
529 524
530 xdr_init_decode(&stream, &buf, NULL); 525 xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
531 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 526 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
532 527
533 do { 528 do {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 57bb31ad7a5e..144f2a3c7185 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1298,8 +1298,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1298 i_size_write(inode, new_isize); 1298 i_size_write(inode, new_isize);
1299 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; 1299 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
1300 } 1300 }
1301 dprintk("NFS: isize change on server for file %s/%ld\n", 1301 dprintk("NFS: isize change on server for file %s/%ld "
1302 inode->i_sb->s_id, inode->i_ino); 1302 "(%Ld to %Ld)\n",
1303 inode->i_sb->s_id,
1304 inode->i_ino,
1305 (long long)cur_isize,
1306 (long long)new_isize);
1303 } 1307 }
1304 } else 1308 } else
1305 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR 1309 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
@@ -1424,9 +1428,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1424 */ 1428 */
1425void nfs4_evict_inode(struct inode *inode) 1429void nfs4_evict_inode(struct inode *inode)
1426{ 1430{
1427 pnfs_destroy_layout(NFS_I(inode));
1428 truncate_inode_pages(&inode->i_data, 0); 1431 truncate_inode_pages(&inode->i_data, 0);
1429 end_writeback(inode); 1432 end_writeback(inode);
1433 pnfs_return_layout(inode);
1434 pnfs_destroy_layout(NFS_I(inode));
1430 /* If we are holding a delegation, return it! */ 1435 /* If we are holding a delegation, return it! */
1431 nfs_inode_return_delegation_noreclaim(inode); 1436 nfs_inode_return_delegation_noreclaim(inode);
1432 /* First call standard NFS clear_inode() code */ 1437 /* First call standard NFS clear_inode() code */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 2df6ca7b5898..b9056cbe68d6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -310,6 +310,7 @@ extern int nfs_migrate_page(struct address_space *,
310#endif 310#endif
311 311
312/* nfs4proc.c */ 312/* nfs4proc.c */
313extern void __nfs4_read_done_cb(struct nfs_read_data *);
313extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data); 314extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
314extern int nfs4_init_client(struct nfs_client *clp, 315extern int nfs4_init_client(struct nfs_client *clp,
315 const struct rpc_timeout *timeparms, 316 const struct rpc_timeout *timeparms,
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index be79dc9f386d..426908809c97 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -421,6 +421,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
421 struct nfs4_deviceid *id, 421 struct nfs4_deviceid *id,
422 gfp_t gfp_flags) 422 gfp_t gfp_flags)
423{ 423{
424 struct nfs4_deviceid_node *d;
424 struct nfs4_file_layout_dsaddr *dsaddr; 425 struct nfs4_file_layout_dsaddr *dsaddr;
425 int status = -EINVAL; 426 int status = -EINVAL;
426 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode); 427 struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
@@ -428,7 +429,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
428 dprintk("--> %s\n", __func__); 429 dprintk("--> %s\n", __func__);
429 430
430 if (fl->pattern_offset > lgr->range.offset) { 431 if (fl->pattern_offset > lgr->range.offset) {
431 dprintk("%s pattern_offset %lld to large\n", 432 dprintk("%s pattern_offset %lld too large\n",
432 __func__, fl->pattern_offset); 433 __func__, fl->pattern_offset);
433 goto out; 434 goto out;
434 } 435 }
@@ -440,12 +441,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
440 } 441 }
441 442
442 /* find and reference the deviceid */ 443 /* find and reference the deviceid */
443 dsaddr = nfs4_fl_find_get_deviceid(id); 444 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld,
444 if (dsaddr == NULL) { 445 NFS_SERVER(lo->plh_inode)->nfs_client, id);
446 if (d == NULL) {
445 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags); 447 dsaddr = get_device_info(lo->plh_inode, id, gfp_flags);
446 if (dsaddr == NULL) 448 if (dsaddr == NULL)
447 goto out; 449 goto out;
448 } 450 } else
451 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
449 fl->dsaddr = dsaddr; 452 fl->dsaddr = dsaddr;
450 453
451 if (fl->first_stripe_index < 0 || 454 if (fl->first_stripe_index < 0 ||
@@ -507,12 +510,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
507 gfp_t gfp_flags) 510 gfp_t gfp_flags)
508{ 511{
509 struct xdr_stream stream; 512 struct xdr_stream stream;
510 struct xdr_buf buf = { 513 struct xdr_buf buf;
511 .pages = lgr->layoutp->pages,
512 .page_len = lgr->layoutp->len,
513 .buflen = lgr->layoutp->len,
514 .len = lgr->layoutp->len,
515 };
516 struct page *scratch; 514 struct page *scratch;
517 __be32 *p; 515 __be32 *p;
518 uint32_t nfl_util; 516 uint32_t nfl_util;
@@ -524,7 +522,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
524 if (!scratch) 522 if (!scratch)
525 return -ENOMEM; 523 return -ENOMEM;
526 524
527 xdr_init_decode(&stream, &buf, NULL); 525 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
528 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 526 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
529 527
530 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), 528 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
@@ -535,7 +533,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
535 533
536 memcpy(id, p, sizeof(*id)); 534 memcpy(id, p, sizeof(*id));
537 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 535 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
538 print_deviceid(id); 536 nfs4_print_deviceid(id);
539 537
540 nfl_util = be32_to_cpup(p++); 538 nfl_util = be32_to_cpup(p++);
541 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS) 539 if (nfl_util & NFL4_UFLG_COMMIT_THRU_MDS)
@@ -653,16 +651,19 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
653/* 651/*
654 * filelayout_pg_test(). Called by nfs_can_coalesce_requests() 652 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
655 * 653 *
656 * return 1 : coalesce page 654 * return true : coalesce page
657 * return 0 : don't coalesce page 655 * return false : don't coalesce page
658 */ 656 */
659int 657bool
660filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 658filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
661 struct nfs_page *req) 659 struct nfs_page *req)
662{ 660{
663 u64 p_stripe, r_stripe; 661 u64 p_stripe, r_stripe;
664 u32 stripe_unit; 662 u32 stripe_unit;
665 663
664 if (!pnfs_generic_pg_test(pgio, prev, req))
665 return 0;
666
666 if (!pgio->pg_lseg) 667 if (!pgio->pg_lseg)
667 return 1; 668 return 1;
668 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT; 669 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
@@ -860,6 +861,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
860 return -ENOMEM; 861 return -ENOMEM;
861} 862}
862 863
864static void
865filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
866{
867 nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node));
868}
869
863static struct pnfs_layoutdriver_type filelayout_type = { 870static struct pnfs_layoutdriver_type filelayout_type = {
864 .id = LAYOUT_NFSV4_1_FILES, 871 .id = LAYOUT_NFSV4_1_FILES,
865 .name = "LAYOUT_NFSV4_1_FILES", 872 .name = "LAYOUT_NFSV4_1_FILES",
@@ -872,6 +879,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
872 .commit_pagelist = filelayout_commit_pagelist, 879 .commit_pagelist = filelayout_commit_pagelist,
873 .read_pagelist = filelayout_read_pagelist, 880 .read_pagelist = filelayout_read_pagelist,
874 .write_pagelist = filelayout_write_pagelist, 881 .write_pagelist = filelayout_write_pagelist,
882 .free_deviceid_node = filelayout_free_deveiceid_node,
875}; 883};
876 884
877static int __init nfs4filelayout_init(void) 885static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index 2b461d77b43a..cebe01e3795e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -59,9 +59,7 @@ struct nfs4_pnfs_ds {
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001 59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60 60
61struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
62 struct hlist_node node; 62 struct nfs4_deviceid_node id_node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags; 63 unsigned long flags;
66 u32 stripe_count; 64 u32 stripe_count;
67 u8 *stripe_indices; 65 u8 *stripe_indices;
@@ -95,14 +93,12 @@ extern struct nfs_fh *
95nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 93nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
96 94
97extern void print_ds(struct nfs4_pnfs_ds *ds); 95extern void print_ds(struct nfs4_pnfs_ds *ds);
98extern void print_deviceid(struct nfs4_deviceid *dev_id);
99u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 96u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
100u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 97u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
101struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 98struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
102 u32 ds_idx); 99 u32 ds_idx);
103extern struct nfs4_file_layout_dsaddr *
104nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
105extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 100extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
101extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
106struct nfs4_file_layout_dsaddr * 102struct nfs4_file_layout_dsaddr *
107get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags); 103get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags);
108 104
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index db07c7af1395..3b7bf1377264 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,30 +37,6 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
64 * Data server cache 40 * Data server cache
65 * 41 *
66 * Data servers can be mapped to different device ids. 42 * Data servers can be mapped to different device ids.
@@ -89,27 +65,6 @@ print_ds(struct nfs4_pnfs_ds *ds)
89 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); 65 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
90} 66}
91 67
92void
93print_ds_list(struct nfs4_file_layout_dsaddr *dsaddr)
94{
95 int i;
96
97 ifdebug(FACILITY) {
98 printk("%s dsaddr->ds_num %d\n", __func__,
99 dsaddr->ds_num);
100 for (i = 0; i < dsaddr->ds_num; i++)
101 print_ds(dsaddr->ds_list[i]);
102 }
103}
104
105void print_deviceid(struct nfs4_deviceid *id)
106{
107 u32 *p = (u32 *)id;
108
109 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
110 p[0], p[1], p[2], p[3]);
111}
112
113/* nfs4_ds_cache_lock is held */ 68/* nfs4_ds_cache_lock is held */
114static struct nfs4_pnfs_ds * 69static struct nfs4_pnfs_ds *
115_data_server_lookup_locked(u32 ip_addr, u32 port) 70_data_server_lookup_locked(u32 ip_addr, u32 port)
@@ -201,13 +156,13 @@ destroy_ds(struct nfs4_pnfs_ds *ds)
201 kfree(ds); 156 kfree(ds);
202} 157}
203 158
204static void 159void
205nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 160nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
206{ 161{
207 struct nfs4_pnfs_ds *ds; 162 struct nfs4_pnfs_ds *ds;
208 int i; 163 int i;
209 164
210 print_deviceid(&dsaddr->deviceid); 165 nfs4_print_deviceid(&dsaddr->id_node.deviceid);
211 166
212 for (i = 0; i < dsaddr->ds_num; i++) { 167 for (i = 0; i < dsaddr->ds_num; i++) {
213 ds = dsaddr->ds_list[i]; 168 ds = dsaddr->ds_list[i];
@@ -353,12 +308,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
353 u8 max_stripe_index; 308 u8 max_stripe_index;
354 struct nfs4_file_layout_dsaddr *dsaddr = NULL; 309 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
355 struct xdr_stream stream; 310 struct xdr_stream stream;
356 struct xdr_buf buf = { 311 struct xdr_buf buf;
357 .pages = pdev->pages,
358 .page_len = pdev->pglen,
359 .buflen = pdev->pglen,
360 .len = pdev->pglen,
361 };
362 struct page *scratch; 312 struct page *scratch;
363 313
364 /* set up xdr stream */ 314 /* set up xdr stream */
@@ -366,7 +316,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
366 if (!scratch) 316 if (!scratch)
367 goto out_err; 317 goto out_err;
368 318
369 xdr_init_decode(&stream, &buf, NULL); 319 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
370 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 320 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
371 321
372 /* Get the stripe count (number of stripe index) */ 322 /* Get the stripe count (number of stripe index) */
@@ -431,8 +381,10 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
431 dsaddr->stripe_indices = stripe_indices; 381 dsaddr->stripe_indices = stripe_indices;
432 stripe_indices = NULL; 382 stripe_indices = NULL;
433 dsaddr->ds_num = num; 383 dsaddr->ds_num = num;
434 384 nfs4_init_deviceid_node(&dsaddr->id_node,
435 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id)); 385 NFS_SERVER(ino)->pnfs_curr_ld,
386 NFS_SERVER(ino)->nfs_client,
387 &pdev->dev_id);
436 388
437 for (i = 0; i < dsaddr->ds_num; i++) { 389 for (i = 0; i < dsaddr->ds_num; i++) {
438 int j; 390 int j;
@@ -505,8 +457,8 @@ out_err:
505static struct nfs4_file_layout_dsaddr * 457static struct nfs4_file_layout_dsaddr *
506decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) 458decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
507{ 459{
508 struct nfs4_file_layout_dsaddr *d, *new; 460 struct nfs4_deviceid_node *d;
509 long hash; 461 struct nfs4_file_layout_dsaddr *n, *new;
510 462
511 new = decode_device(inode, dev, gfp_flags); 463 new = decode_device(inode, dev, gfp_flags);
512 if (!new) { 464 if (!new) {
@@ -515,20 +467,13 @@ decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_fl
515 return NULL; 467 return NULL;
516 } 468 }
517 469
518 spin_lock(&filelayout_deviceid_lock); 470 d = nfs4_insert_deviceid_node(&new->id_node);
519 d = nfs4_fl_find_get_deviceid(&new->deviceid); 471 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
520 if (d) { 472 if (n != new) {
521 spin_unlock(&filelayout_deviceid_lock);
522 nfs4_fl_free_deviceid(new); 473 nfs4_fl_free_deviceid(new);
523 return d; 474 return n;
524 } 475 }
525 476
526 INIT_HLIST_NODE(&new->node);
527 atomic_set(&new->ref, 1);
528 hash = nfs4_fl_deviceid_hash(&new->deviceid);
529 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
530 spin_unlock(&filelayout_deviceid_lock);
531
532 return new; 477 return new;
533} 478}
534 479
@@ -600,35 +545,7 @@ out_free:
600void 545void
601nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 546nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
602{ 547{
603 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) { 548 nfs4_put_deviceid_node(&dsaddr->id_node);
604 hlist_del_rcu(&dsaddr->node);
605 spin_unlock(&filelayout_deviceid_lock);
606
607 synchronize_rcu();
608 nfs4_fl_free_deviceid(dsaddr);
609 }
610}
611
612struct nfs4_file_layout_dsaddr *
613nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
614{
615 struct nfs4_file_layout_dsaddr *d;
616 struct hlist_node *n;
617 long hash = nfs4_fl_deviceid_hash(id);
618
619
620 rcu_read_lock();
621 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
622 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
623 if (!atomic_inc_not_zero(&d->ref))
624 goto fail;
625 rcu_read_unlock();
626 return d;
627 }
628 }
629fail:
630 rcu_read_unlock();
631 return NULL;
632} 549}
633 550
634/* 551/*
@@ -676,15 +593,15 @@ static void
676filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr, 593filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
677 int err, u32 ds_addr) 594 int err, u32 ds_addr)
678{ 595{
679 u32 *p = (u32 *)&dsaddr->deviceid; 596 u32 *p = (u32 *)&dsaddr->id_node.deviceid;
680 597
681 printk(KERN_ERR "NFS: data server %x connection error %d." 598 printk(KERN_ERR "NFS: data server %x connection error %d."
682 " Deviceid [%x%x%x%x] marked out of use.\n", 599 " Deviceid [%x%x%x%x] marked out of use.\n",
683 ds_addr, err, p[0], p[1], p[2], p[3]); 600 ds_addr, err, p[0], p[1], p[2], p[3]);
684 601
685 spin_lock(&filelayout_deviceid_lock); 602 spin_lock(&nfs4_ds_cache_lock);
686 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY; 603 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
687 spin_unlock(&filelayout_deviceid_lock); 604 spin_unlock(&nfs4_ds_cache_lock);
688} 605}
689 606
690struct nfs4_pnfs_ds * 607struct nfs4_pnfs_ds *
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cf1b339c3937..d2c4b59c896d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -267,9 +267,11 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
267 break; 267 break;
268 nfs4_schedule_stateid_recovery(server, state); 268 nfs4_schedule_stateid_recovery(server, state);
269 goto wait_on_recovery; 269 goto wait_on_recovery;
270 case -NFS4ERR_EXPIRED:
271 if (state != NULL)
272 nfs4_schedule_stateid_recovery(server, state);
270 case -NFS4ERR_STALE_STATEID: 273 case -NFS4ERR_STALE_STATEID:
271 case -NFS4ERR_STALE_CLIENTID: 274 case -NFS4ERR_STALE_CLIENTID:
272 case -NFS4ERR_EXPIRED:
273 nfs4_schedule_lease_recovery(clp); 275 nfs4_schedule_lease_recovery(clp);
274 goto wait_on_recovery; 276 goto wait_on_recovery;
275#if defined(CONFIG_NFS_V4_1) 277#if defined(CONFIG_NFS_V4_1)
@@ -2361,6 +2363,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2361 struct nfs4_state *state = NULL; 2363 struct nfs4_state *state = NULL;
2362 int status; 2364 int status;
2363 2365
2366 if (pnfs_ld_layoutret_on_setattr(inode))
2367 pnfs_return_layout(inode);
2368
2364 nfs_fattr_init(fattr); 2369 nfs_fattr_init(fattr);
2365 2370
2366 /* Search for an existing open(O_WRITE) file */ 2371 /* Search for an existing open(O_WRITE) file */
@@ -3175,6 +3180,11 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3175 return err; 3180 return err;
3176} 3181}
3177 3182
3183void __nfs4_read_done_cb(struct nfs_read_data *data)
3184{
3185 nfs_invalidate_atime(data->inode);
3186}
3187
3178static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data) 3188static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3179{ 3189{
3180 struct nfs_server *server = NFS_SERVER(data->inode); 3190 struct nfs_server *server = NFS_SERVER(data->inode);
@@ -3184,7 +3194,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3184 return -EAGAIN; 3194 return -EAGAIN;
3185 } 3195 }
3186 3196
3187 nfs_invalidate_atime(data->inode); 3197 __nfs4_read_done_cb(data);
3188 if (task->tk_status > 0) 3198 if (task->tk_status > 0)
3189 renew_lease(server, data->timestamp); 3199 renew_lease(server, data->timestamp);
3190 return 0; 3200 return 0;
@@ -3198,7 +3208,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3198 if (!nfs4_sequence_done(task, &data->res.seq_res)) 3208 if (!nfs4_sequence_done(task, &data->res.seq_res))
3199 return -EAGAIN; 3209 return -EAGAIN;
3200 3210
3201 return data->read_done_cb(task, data); 3211 return data->read_done_cb ? data->read_done_cb(task, data) :
3212 nfs4_read_done_cb(task, data);
3202} 3213}
3203 3214
3204static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3215static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
@@ -3243,7 +3254,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3243{ 3254{
3244 if (!nfs4_sequence_done(task, &data->res.seq_res)) 3255 if (!nfs4_sequence_done(task, &data->res.seq_res))
3245 return -EAGAIN; 3256 return -EAGAIN;
3246 return data->write_done_cb(task, data); 3257 return data->write_done_cb ? data->write_done_cb(task, data) :
3258 nfs4_write_done_cb(task, data);
3247} 3259}
3248 3260
3249/* Reset the the nfs_write_data to send the write to the MDS. */ 3261/* Reset the the nfs_write_data to send the write to the MDS. */
@@ -3670,9 +3682,11 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3670 break; 3682 break;
3671 nfs4_schedule_stateid_recovery(server, state); 3683 nfs4_schedule_stateid_recovery(server, state);
3672 goto wait_on_recovery; 3684 goto wait_on_recovery;
3685 case -NFS4ERR_EXPIRED:
3686 if (state != NULL)
3687 nfs4_schedule_stateid_recovery(server, state);
3673 case -NFS4ERR_STALE_STATEID: 3688 case -NFS4ERR_STALE_STATEID:
3674 case -NFS4ERR_STALE_CLIENTID: 3689 case -NFS4ERR_STALE_CLIENTID:
3675 case -NFS4ERR_EXPIRED:
3676 nfs4_schedule_lease_recovery(clp); 3690 nfs4_schedule_lease_recovery(clp);
3677 goto wait_on_recovery; 3691 goto wait_on_recovery;
3678#if defined(CONFIG_NFS_V4_1) 3692#if defined(CONFIG_NFS_V4_1)
@@ -4543,6 +4557,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4543 case -ESTALE: 4557 case -ESTALE:
4544 goto out; 4558 goto out;
4545 case -NFS4ERR_EXPIRED: 4559 case -NFS4ERR_EXPIRED:
4560 nfs4_schedule_stateid_recovery(server, state);
4546 case -NFS4ERR_STALE_CLIENTID: 4561 case -NFS4ERR_STALE_CLIENTID:
4547 case -NFS4ERR_STALE_STATEID: 4562 case -NFS4ERR_STALE_STATEID:
4548 nfs4_schedule_lease_recovery(server->nfs_client); 4563 nfs4_schedule_lease_recovery(server->nfs_client);
@@ -5666,6 +5681,88 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5666 return status; 5681 return status;
5667} 5682}
5668 5683
5684static void
5685nfs4_layoutreturn_prepare(struct rpc_task *task, void *calldata)
5686{
5687 struct nfs4_layoutreturn *lrp = calldata;
5688
5689 dprintk("--> %s\n", __func__);
5690 if (nfs41_setup_sequence(lrp->clp->cl_session, &lrp->args.seq_args,
5691 &lrp->res.seq_res, 0, task))
5692 return;
5693 rpc_call_start(task);
5694}
5695
5696static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
5697{
5698 struct nfs4_layoutreturn *lrp = calldata;
5699 struct nfs_server *server;
5700
5701 dprintk("--> %s\n", __func__);
5702
5703 if (!nfs4_sequence_done(task, &lrp->res.seq_res))
5704 return;
5705
5706 server = NFS_SERVER(lrp->args.inode);
5707 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5708 nfs_restart_rpc(task, lrp->clp);
5709 return;
5710 }
5711 if (task->tk_status == 0) {
5712 struct pnfs_layout_hdr *lo = NFS_I(lrp->args.inode)->layout;
5713
5714 if (lrp->res.lrs_present) {
5715 spin_lock(&lo->plh_inode->i_lock);
5716 pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
5717 spin_unlock(&lo->plh_inode->i_lock);
5718 } else
5719 BUG_ON(!list_empty(&lo->plh_segs));
5720 }
5721 dprintk("<-- %s\n", __func__);
5722}
5723
5724static void nfs4_layoutreturn_release(void *calldata)
5725{
5726 struct nfs4_layoutreturn *lrp = calldata;
5727
5728 dprintk("--> %s\n", __func__);
5729 put_layout_hdr(NFS_I(lrp->args.inode)->layout);
5730 kfree(calldata);
5731 dprintk("<-- %s\n", __func__);
5732}
5733
5734static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
5735 .rpc_call_prepare = nfs4_layoutreturn_prepare,
5736 .rpc_call_done = nfs4_layoutreturn_done,
5737 .rpc_release = nfs4_layoutreturn_release,
5738};
5739
5740int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
5741{
5742 struct rpc_task *task;
5743 struct rpc_message msg = {
5744 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTRETURN],
5745 .rpc_argp = &lrp->args,
5746 .rpc_resp = &lrp->res,
5747 };
5748 struct rpc_task_setup task_setup_data = {
5749 .rpc_client = lrp->clp->cl_rpcclient,
5750 .rpc_message = &msg,
5751 .callback_ops = &nfs4_layoutreturn_call_ops,
5752 .callback_data = lrp,
5753 };
5754 int status;
5755
5756 dprintk("--> %s\n", __func__);
5757 task = rpc_run_task(&task_setup_data);
5758 if (IS_ERR(task))
5759 return PTR_ERR(task);
5760 status = task->tk_status;
5761 dprintk("<-- %s status=%d\n", __func__, status);
5762 rpc_put_task(task);
5763 return status;
5764}
5765
5669static int 5766static int
5670_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev) 5767_nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5671{ 5768{
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 036f5adc9e1f..e97dd219f84f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1466,7 +1466,10 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1466#ifdef CONFIG_NFS_V4_1 1466#ifdef CONFIG_NFS_V4_1
1467void nfs4_schedule_session_recovery(struct nfs4_session *session) 1467void nfs4_schedule_session_recovery(struct nfs4_session *session)
1468{ 1468{
1469 nfs4_schedule_lease_recovery(session->clp); 1469 struct nfs_client *clp = session->clp;
1470
1471 set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1472 nfs4_schedule_lease_recovery(clp);
1470} 1473}
1471EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); 1474EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1472 1475
@@ -1549,6 +1552,7 @@ static int nfs4_reset_session(struct nfs_client *clp)
1549 status = nfs4_recovery_handle_error(clp, status); 1552 status = nfs4_recovery_handle_error(clp, status);
1550 goto out; 1553 goto out;
1551 } 1554 }
1555 clear_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state);
1552 /* create_session negotiated new slot table */ 1556 /* create_session negotiated new slot table */
1553 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1557 clear_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1554 1558
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c3ccd2c46834..d869a5e5464b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -338,7 +338,11 @@ static int nfs4_stat_to_errno(int);
338 1 /* layoutupdate4 layout type */ + \ 338 1 /* layoutupdate4 layout type */ + \
339 1 /* NULL filelayout layoutupdate4 payload */) 339 1 /* NULL filelayout layoutupdate4 payload */)
340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
341 341#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
342 encode_stateid_maxsz + \
343 1 /* FIXME: opaque lrf_body always empty at the moment */)
344#define decode_layoutreturn_maxsz (op_decode_hdr_maxsz + \
345 1 + decode_stateid_maxsz)
342#else /* CONFIG_NFS_V4_1 */ 346#else /* CONFIG_NFS_V4_1 */
343#define encode_sequence_maxsz 0 347#define encode_sequence_maxsz 0
344#define decode_sequence_maxsz 0 348#define decode_sequence_maxsz 0
@@ -760,7 +764,14 @@ static int nfs4_stat_to_errno(int);
760 decode_putfh_maxsz + \ 764 decode_putfh_maxsz + \
761 decode_layoutcommit_maxsz + \ 765 decode_layoutcommit_maxsz + \
762 decode_getattr_maxsz) 766 decode_getattr_maxsz)
763 767#define NFS4_enc_layoutreturn_sz (compound_encode_hdr_maxsz + \
768 encode_sequence_maxsz + \
769 encode_putfh_maxsz + \
770 encode_layoutreturn_maxsz)
771#define NFS4_dec_layoutreturn_sz (compound_decode_hdr_maxsz + \
772 decode_sequence_maxsz + \
773 decode_putfh_maxsz + \
774 decode_layoutreturn_maxsz)
764 775
765const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 776const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
766 compound_encode_hdr_maxsz + 777 compound_encode_hdr_maxsz +
@@ -1864,6 +1875,7 @@ encode_layoutget(struct xdr_stream *xdr,
1864 1875
1865static int 1876static int
1866encode_layoutcommit(struct xdr_stream *xdr, 1877encode_layoutcommit(struct xdr_stream *xdr,
1878 struct inode *inode,
1867 const struct nfs4_layoutcommit_args *args, 1879 const struct nfs4_layoutcommit_args *args,
1868 struct compound_hdr *hdr) 1880 struct compound_hdr *hdr)
1869{ 1881{
@@ -1872,7 +1884,7 @@ encode_layoutcommit(struct xdr_stream *xdr,
1872 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten, 1884 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1873 NFS_SERVER(args->inode)->pnfs_curr_ld->id); 1885 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1874 1886
1875 p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE); 1887 p = reserve_space(xdr, 44 + NFS4_STATEID_SIZE);
1876 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT); 1888 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1877 /* Only whole file layouts */ 1889 /* Only whole file layouts */
1878 p = xdr_encode_hyper(p, 0); /* offset */ 1890 p = xdr_encode_hyper(p, 0); /* offset */
@@ -1883,12 +1895,49 @@ encode_layoutcommit(struct xdr_stream *xdr,
1883 p = xdr_encode_hyper(p, args->lastbytewritten); 1895 p = xdr_encode_hyper(p, args->lastbytewritten);
1884 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1896 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
1885 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1897 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
1886 *p++ = cpu_to_be32(0); /* no file layout payload */ 1898
1899 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
1900 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
1901 NFS_I(inode)->layout, xdr, args);
1902 else {
1903 p = reserve_space(xdr, 4);
1904 *p = cpu_to_be32(0); /* no layout-type payload */
1905 }
1887 1906
1888 hdr->nops++; 1907 hdr->nops++;
1889 hdr->replen += decode_layoutcommit_maxsz; 1908 hdr->replen += decode_layoutcommit_maxsz;
1890 return 0; 1909 return 0;
1891} 1910}
1911
1912static void
1913encode_layoutreturn(struct xdr_stream *xdr,
1914 const struct nfs4_layoutreturn_args *args,
1915 struct compound_hdr *hdr)
1916{
1917 __be32 *p;
1918
1919 p = reserve_space(xdr, 20);
1920 *p++ = cpu_to_be32(OP_LAYOUTRETURN);
1921 *p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
1922 *p++ = cpu_to_be32(args->layout_type);
1923 *p++ = cpu_to_be32(IOMODE_ANY);
1924 *p = cpu_to_be32(RETURN_FILE);
1925 p = reserve_space(xdr, 16 + NFS4_STATEID_SIZE);
1926 p = xdr_encode_hyper(p, 0);
1927 p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
1928 spin_lock(&args->inode->i_lock);
1929 xdr_encode_opaque_fixed(p, &args->stateid.data, NFS4_STATEID_SIZE);
1930 spin_unlock(&args->inode->i_lock);
1931 if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) {
1932 NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn(
1933 NFS_I(args->inode)->layout, xdr, args);
1934 } else {
1935 p = reserve_space(xdr, 4);
1936 *p = cpu_to_be32(0);
1937 }
1938 hdr->nops++;
1939 hdr->replen += decode_layoutreturn_maxsz;
1940}
1892#endif /* CONFIG_NFS_V4_1 */ 1941#endif /* CONFIG_NFS_V4_1 */
1893 1942
1894/* 1943/*
@@ -2706,10 +2755,12 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2706/* 2755/*
2707 * Encode LAYOUTCOMMIT request 2756 * Encode LAYOUTCOMMIT request
2708 */ 2757 */
2709static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req, 2758static void nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2710 struct xdr_stream *xdr, 2759 struct xdr_stream *xdr,
2711 struct nfs4_layoutcommit_args *args) 2760 struct nfs4_layoutcommit_args *args)
2712{ 2761{
2762 struct nfs4_layoutcommit_data *data =
2763 container_of(args, struct nfs4_layoutcommit_data, args);
2713 struct compound_hdr hdr = { 2764 struct compound_hdr hdr = {
2714 .minorversion = nfs4_xdr_minorversion(&args->seq_args), 2765 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2715 }; 2766 };
@@ -2717,10 +2768,27 @@ static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2717 encode_compound_hdr(xdr, req, &hdr); 2768 encode_compound_hdr(xdr, req, &hdr);
2718 encode_sequence(xdr, &args->seq_args, &hdr); 2769 encode_sequence(xdr, &args->seq_args, &hdr);
2719 encode_putfh(xdr, NFS_FH(args->inode), &hdr); 2770 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2720 encode_layoutcommit(xdr, args, &hdr); 2771 encode_layoutcommit(xdr, data->args.inode, args, &hdr);
2721 encode_getfattr(xdr, args->bitmask, &hdr); 2772 encode_getfattr(xdr, args->bitmask, &hdr);
2722 encode_nops(&hdr); 2773 encode_nops(&hdr);
2723 return 0; 2774}
2775
2776/*
2777 * Encode LAYOUTRETURN request
2778 */
2779static void nfs4_xdr_enc_layoutreturn(struct rpc_rqst *req,
2780 struct xdr_stream *xdr,
2781 struct nfs4_layoutreturn_args *args)
2782{
2783 struct compound_hdr hdr = {
2784 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2785 };
2786
2787 encode_compound_hdr(xdr, req, &hdr);
2788 encode_sequence(xdr, &args->seq_args, &hdr);
2789 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2790 encode_layoutreturn(xdr, args, &hdr);
2791 encode_nops(&hdr);
2724} 2792}
2725#endif /* CONFIG_NFS_V4_1 */ 2793#endif /* CONFIG_NFS_V4_1 */
2726 2794
@@ -5203,6 +5271,27 @@ out_overflow:
5203 return -EIO; 5271 return -EIO;
5204} 5272}
5205 5273
5274static int decode_layoutreturn(struct xdr_stream *xdr,
5275 struct nfs4_layoutreturn_res *res)
5276{
5277 __be32 *p;
5278 int status;
5279
5280 status = decode_op_hdr(xdr, OP_LAYOUTRETURN);
5281 if (status)
5282 return status;
5283 p = xdr_inline_decode(xdr, 4);
5284 if (unlikely(!p))
5285 goto out_overflow;
5286 res->lrs_present = be32_to_cpup(p);
5287 if (res->lrs_present)
5288 status = decode_stateid(xdr, &res->stateid);
5289 return status;
5290out_overflow:
5291 print_overflow_msg(__func__, xdr);
5292 return -EIO;
5293}
5294
5206static int decode_layoutcommit(struct xdr_stream *xdr, 5295static int decode_layoutcommit(struct xdr_stream *xdr,
5207 struct rpc_rqst *req, 5296 struct rpc_rqst *req,
5208 struct nfs4_layoutcommit_res *res) 5297 struct nfs4_layoutcommit_res *res)
@@ -6320,6 +6409,30 @@ out:
6320} 6409}
6321 6410
6322/* 6411/*
6412 * Decode LAYOUTRETURN response
6413 */
6414static int nfs4_xdr_dec_layoutreturn(struct rpc_rqst *rqstp,
6415 struct xdr_stream *xdr,
6416 struct nfs4_layoutreturn_res *res)
6417{
6418 struct compound_hdr hdr;
6419 int status;
6420
6421 status = decode_compound_hdr(xdr, &hdr);
6422 if (status)
6423 goto out;
6424 status = decode_sequence(xdr, &res->seq_res, rqstp);
6425 if (status)
6426 goto out;
6427 status = decode_putfh(xdr);
6428 if (status)
6429 goto out;
6430 status = decode_layoutreturn(xdr, res);
6431out:
6432 return status;
6433}
6434
6435/*
6323 * Decode LAYOUTCOMMIT response 6436 * Decode LAYOUTCOMMIT response
6324 */ 6437 */
6325static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp, 6438static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
@@ -6547,6 +6660,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6547 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6660 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6548 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6661 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6549 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit), 6662 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6663 PROC(LAYOUTRETURN, enc_layoutreturn, dec_layoutreturn),
6550#endif /* CONFIG_NFS_V4_1 */ 6664#endif /* CONFIG_NFS_V4_1 */
6551}; 6665};
6552 6666
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index c541093a5bf2..c4744e1d513c 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -87,7 +87,7 @@
87#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
88 88
89/* Default NFSROOT mount options. */ 89/* Default NFSROOT mount options. */
90#define NFS_DEF_OPTIONS "udp" 90#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
91 91
92/* Parameters passed from the kernel command line */ 92/* Parameters passed from the kernel command line */
93static char nfs_root_parms[256] __initdata = ""; 93static char nfs_root_parms[256] __initdata = "";
diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild
new file mode 100644
index 000000000000..ed30ea072bb8
--- /dev/null
+++ b/fs/nfs/objlayout/Kbuild
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Objects Layout Driver kernel module
3#
4objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o
5obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
new file mode 100644
index 000000000000..9cf208df1f25
--- /dev/null
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -0,0 +1,1057 @@
1/*
2 * pNFS Objects layout implementation over open-osd initiator library
3 *
4 * Copyright (C) 2009 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/module.h>
41#include <scsi/osd_initiator.h>
42
43#include "objlayout.h"
44
45#define NFSDBG_FACILITY NFSDBG_PNFS_LD
46
/*
 * Cast any integer expression to unsigned long long for "%llx"/"%llu".
 * The argument is parenthesized so the cast applies to the whole
 * expression rather than only to its first operand.
 */
#define _LLU(x) ((unsigned long long)(x))
48
/*
 * Number of struct bio_vec entries that fit in one page after the
 * struct bio header.  Used in this file as the upper bound on the size
 * passed to bio_kmalloc() and in the max_io_size calculation.
 */
49enum { BIO_MAX_PAGES_KMALLOC =
50 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
51};
52
/*
 * Pairs a pNFS device-id node with the osd_dev handle looked up for it.
 * Code in this file converts between the two with
 * container_of(..., struct objio_dev_ent, id_node).
 */
53struct objio_dev_ent {
54 struct nfs4_deviceid_node id_node;
/* released with osduld_put_device() in objio_free_deviceid_node() */
55 struct osd_dev *od;
56};
57
58static void
59objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60{
61 struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node);
62
63 dprintk("%s: free od=%p\n", __func__, de->od);
64 osduld_put_device(de->od);
65 kfree(de);
66}
67
68static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
69 const struct nfs4_deviceid *d_id)
70{
71 struct nfs4_deviceid_node *d;
72 struct objio_dev_ent *de;
73
74 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
75 if (!d)
76 return NULL;
77
78 de = container_of(d, struct objio_dev_ent, id_node);
79 return de;
80}
81
82static struct objio_dev_ent *
83_dev_list_add(const struct nfs_server *nfss,
84 const struct nfs4_deviceid *d_id, struct osd_dev *od,
85 gfp_t gfp_flags)
86{
87 struct nfs4_deviceid_node *d;
88 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
89 struct objio_dev_ent *n;
90
91 if (!de) {
92 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
93 return NULL;
94 }
95
96 dprintk("%s: Adding od=%p\n", __func__, od);
97 nfs4_init_deviceid_node(&de->id_node,
98 nfss->pnfs_curr_ld,
99 nfss->nfs_client,
100 d_id);
101 de->od = od;
102
103 d = nfs4_insert_deviceid_node(&de->id_node);
104 n = container_of(d, struct objio_dev_ent, id_node);
105 if (n != de) {
106 dprintk("%s: Race with other n->od=%p\n", __func__, n->od);
107 objio_free_deviceid_node(&de->id_node);
108 de = n;
109 }
110
111 atomic_inc(&de->id_node.ref);
112 return de;
113}
114
/*
 * Per-component storage for credential bytes.  copy_single_comp() copies
 * the component's capability key into caps_key and its capability into
 * creds, then points the component's cred pointers at these arrays.
 */
115struct caps_buffers {
116 u8 caps_key[OSD_CRYPTO_KEYID_SIZE];
117 u8 creds[OSD_CAP_LEN];
118};
119
/*
 * Driver-private layout segment.  objio_alloc_lseg() allocates it in one
 * kzalloc() together with the ods[] array, the comps array and one
 * struct caps_buffers per component, in that order.
 */
120struct objio_segment {
/* generic segment; OBJIO_LSEG() maps it back to this struct */
121 struct pnfs_layout_segment lseg;
122
/* points into the same allocation, right after ods[] */
123 struct pnfs_osd_object_cred *comps;
124
/* odm_mirror_cnt + 1 */
125 unsigned mirrors_p1;
/* odm_stripe_unit */
126 unsigned stripe_unit;
127 unsigned group_width; /* Data stripe_units without integrity comps */
/* odm_group_depth, or -1 when the map has no odm_group_width */
128 u64 group_depth;
/* 1 when the map has no odm_group_width */
129 unsigned group_count;
130
/* precomputed in objio_alloc_lseg(); compared against in objio_pg_test() */
131 unsigned max_io_size;
132
/* olo_comps_index; _io_od() subtracts it before indexing ods[] */
133 unsigned comps_index;
/* olo_num_comps; element count of comps and ods[] */
134 unsigned num_comps;
135 /* variable length */
/* filled by objio_devices_lookup(), one entry per component */
136 struct objio_dev_ent *ods[];
137};
138
139static inline struct objio_segment *
140OBJIO_LSEG(struct pnfs_layout_segment *lseg)
141{
142 return container_of(lseg, struct objio_segment, lseg);
143}
144
145struct objio_state;
/* Completion callback type; invoked by _last_io() and, in sync mode, by _io_exec(). */
146typedef ssize_t (*objio_done_fn)(struct objio_state *ios);
147
/*
 * State of one read or write.  objio_alloc_io_state() allocates it with
 * room for num_comps per_dev[] slots followed by num_comps ioerrs entries.
 */
148struct objio_state {
149 /* Generic layer */
150 struct objlayout_io_state ol_state;
151
/* segment this IO runs against; set in objio_alloc_io_state() */
152 struct objio_segment *layout;
153
/* initialised in _io_exec(); one extra get per submitted request, done() runs on the last put */
154 struct kref kref;
155 objio_done_fn done;
/* in sync mode _io_exec() stores its on-stack completion here for _sync_done() */
156 void *private;
157
/* bytes queued so far; accumulated in _prepare_one_group() */
158 unsigned long length;
159 unsigned numdevs; /* Actually used devs in this IO */
160 /* A per-device variable array of size numdevs */
161 struct _objio_per_comp {
162 struct bio *bio;
163 struct osd_request *or;
164 unsigned long length;
165 u64 offset;
166 unsigned dev;
167 } per_dev[];
168};
169
170/* Send and wait for a get_device_info of devices in the layout,
171 then look them up with the osd_initiator library */
172static struct objio_dev_ent *_device_lookup(struct pnfs_layout_hdr *pnfslay,
173 struct objio_segment *objio_seg, unsigned comp,
174 gfp_t gfp_flags)
175{
176 struct pnfs_osd_deviceaddr *deviceaddr;
177 struct nfs4_deviceid *d_id;
178 struct objio_dev_ent *ode;
179 struct osd_dev *od;
180 struct osd_dev_info odi;
181 int err;
182
183 d_id = &objio_seg->comps[comp].oc_object_id.oid_device_id;
184
185 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
186 if (ode)
187 return ode;
188
189 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
190 if (unlikely(err)) {
191 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
192 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
193 return ERR_PTR(err);
194 }
195
196 odi.systemid_len = deviceaddr->oda_systemid.len;
197 if (odi.systemid_len > sizeof(odi.systemid)) {
198 err = -EINVAL;
199 goto out;
200 } else if (odi.systemid_len)
201 memcpy(odi.systemid, deviceaddr->oda_systemid.data,
202 odi.systemid_len);
203 odi.osdname_len = deviceaddr->oda_osdname.len;
204 odi.osdname = (u8 *)deviceaddr->oda_osdname.data;
205
206 if (!odi.osdname_len && !odi.systemid_len) {
207 dprintk("%s: !odi.osdname_len && !odi.systemid_len\n",
208 __func__);
209 err = -ENODEV;
210 goto out;
211 }
212
213 od = osduld_info_lookup(&odi);
214 if (unlikely(IS_ERR(od))) {
215 err = PTR_ERR(od);
216 dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
217 goto out;
218 }
219
220 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
221 gfp_flags);
222
223out:
224 dprintk("%s: return=%d\n", __func__, err);
225 objlayout_put_deviceinfo(deviceaddr);
226 return err ? ERR_PTR(err) : ode;
227}
228
229static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
230 struct objio_segment *objio_seg,
231 gfp_t gfp_flags)
232{
233 unsigned i;
234 int err;
235
236 /* lookup all devices */
237 for (i = 0; i < objio_seg->num_comps; i++) {
238 struct objio_dev_ent *ode;
239
240 ode = _device_lookup(pnfslay, objio_seg, i, gfp_flags);
241 if (unlikely(IS_ERR(ode))) {
242 err = PTR_ERR(ode);
243 goto out;
244 }
245 objio_seg->ods[i] = ode;
246 }
247 err = 0;
248
249out:
250 dprintk("%s: return=%d\n", __func__, err);
251 return err;
252}
253
254static int _verify_data_map(struct pnfs_osd_layout *layout)
255{
256 struct pnfs_osd_data_map *data_map = &layout->olo_map;
257 u64 stripe_length;
258 u32 group_width;
259
260/* FIXME: Only raid0 for now. if not go through MDS */
261 if (data_map->odm_raid_algorithm != PNFS_OSD_RAID_0) {
262 printk(KERN_ERR "Only RAID_0 for now\n");
263 return -ENOTSUPP;
264 }
265 if (0 != (data_map->odm_num_comps % (data_map->odm_mirror_cnt + 1))) {
266 printk(KERN_ERR "Data Map wrong, num_comps=%u mirrors=%u\n",
267 data_map->odm_num_comps, data_map->odm_mirror_cnt);
268 return -EINVAL;
269 }
270
271 if (data_map->odm_group_width)
272 group_width = data_map->odm_group_width;
273 else
274 group_width = data_map->odm_num_comps /
275 (data_map->odm_mirror_cnt + 1);
276
277 stripe_length = (u64)data_map->odm_stripe_unit * group_width;
278 if (stripe_length >= (1ULL << 32)) {
279 printk(KERN_ERR "Total Stripe length(0x%llx)"
280 " >= 32bit is not supported\n", _LLU(stripe_length));
281 return -ENOTSUPP;
282 }
283
284 if (0 != (data_map->odm_stripe_unit & ~PAGE_MASK)) {
285 printk(KERN_ERR "Stripe Unit(0x%llx)"
286 " must be Multples of PAGE_SIZE(0x%lx)\n",
287 _LLU(data_map->odm_stripe_unit), PAGE_SIZE);
288 return -ENOTSUPP;
289 }
290
291 return 0;
292}
293
294static void copy_single_comp(struct pnfs_osd_object_cred *cur_comp,
295 struct pnfs_osd_object_cred *src_comp,
296 struct caps_buffers *caps_p)
297{
298 WARN_ON(src_comp->oc_cap_key.cred_len > sizeof(caps_p->caps_key));
299 WARN_ON(src_comp->oc_cap.cred_len > sizeof(caps_p->creds));
300
301 *cur_comp = *src_comp;
302
303 memcpy(caps_p->caps_key, src_comp->oc_cap_key.cred,
304 sizeof(caps_p->caps_key));
305 cur_comp->oc_cap_key.cred = caps_p->caps_key;
306
307 memcpy(caps_p->creds, src_comp->oc_cap.cred,
308 sizeof(caps_p->creds));
309 cur_comp->oc_cap.cred = caps_p->creds;
310}
311
312int objio_alloc_lseg(struct pnfs_layout_segment **outp,
313 struct pnfs_layout_hdr *pnfslay,
314 struct pnfs_layout_range *range,
315 struct xdr_stream *xdr,
316 gfp_t gfp_flags)
317{
318 struct objio_segment *objio_seg;
319 struct pnfs_osd_xdr_decode_layout_iter iter;
320 struct pnfs_osd_layout layout;
321 struct pnfs_osd_object_cred *cur_comp, src_comp;
322 struct caps_buffers *caps_p;
323 int err;
324
325 err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr);
326 if (unlikely(err))
327 return err;
328
329 err = _verify_data_map(&layout);
330 if (unlikely(err))
331 return err;
332
333 objio_seg = kzalloc(sizeof(*objio_seg) +
334 sizeof(objio_seg->ods[0]) * layout.olo_num_comps +
335 sizeof(*objio_seg->comps) * layout.olo_num_comps +
336 sizeof(struct caps_buffers) * layout.olo_num_comps,
337 gfp_flags);
338 if (!objio_seg)
339 return -ENOMEM;
340
341 objio_seg->comps = (void *)(objio_seg->ods + layout.olo_num_comps);
342 cur_comp = objio_seg->comps;
343 caps_p = (void *)(cur_comp + layout.olo_num_comps);
344 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err))
345 copy_single_comp(cur_comp++, &src_comp, caps_p++);
346 if (unlikely(err))
347 goto err;
348
349 objio_seg->num_comps = layout.olo_num_comps;
350 objio_seg->comps_index = layout.olo_comps_index;
351 err = objio_devices_lookup(pnfslay, objio_seg, gfp_flags);
352 if (err)
353 goto err;
354
355 objio_seg->mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1;
356 objio_seg->stripe_unit = layout.olo_map.odm_stripe_unit;
357 if (layout.olo_map.odm_group_width) {
358 objio_seg->group_width = layout.olo_map.odm_group_width;
359 objio_seg->group_depth = layout.olo_map.odm_group_depth;
360 objio_seg->group_count = layout.olo_map.odm_num_comps /
361 objio_seg->mirrors_p1 /
362 objio_seg->group_width;
363 } else {
364 objio_seg->group_width = layout.olo_map.odm_num_comps /
365 objio_seg->mirrors_p1;
366 objio_seg->group_depth = -1;
367 objio_seg->group_count = 1;
368 }
369
370 /* Cache this calculation it will hit for every page */
371 objio_seg->max_io_size = (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE -
372 objio_seg->stripe_unit) *
373 objio_seg->group_width;
374
375 *outp = &objio_seg->lseg;
376 return 0;
377
378err:
379 kfree(objio_seg);
380 dprintk("%s: Error: return %d\n", __func__, err);
381 *outp = NULL;
382 return err;
383}
384
385void objio_free_lseg(struct pnfs_layout_segment *lseg)
386{
387 int i;
388 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
389
390 for (i = 0; i < objio_seg->num_comps; i++) {
391 if (!objio_seg->ods[i])
392 break;
393 nfs4_put_deviceid_node(&objio_seg->ods[i]->id_node);
394 }
395 kfree(objio_seg);
396}
397
398int objio_alloc_io_state(struct pnfs_layout_segment *lseg,
399 struct objlayout_io_state **outp,
400 gfp_t gfp_flags)
401{
402 struct objio_segment *objio_seg = OBJIO_LSEG(lseg);
403 struct objio_state *ios;
404 const unsigned first_size = sizeof(*ios) +
405 objio_seg->num_comps * sizeof(ios->per_dev[0]);
406 const unsigned sec_size = objio_seg->num_comps *
407 sizeof(ios->ol_state.ioerrs[0]);
408
409 ios = kzalloc(first_size + sec_size, gfp_flags);
410 if (unlikely(!ios))
411 return -ENOMEM;
412
413 ios->layout = objio_seg;
414 ios->ol_state.ioerrs = ((void *)ios) + first_size;
415 ios->ol_state.num_comps = objio_seg->num_comps;
416
417 *outp = &ios->ol_state;
418 return 0;
419}
420
421void objio_free_io_state(struct objlayout_io_state *ol_state)
422{
423 struct objio_state *ios = container_of(ol_state, struct objio_state,
424 ol_state);
425
426 kfree(ios);
427}
428
429enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep)
430{
431 switch (oep) {
432 case OSD_ERR_PRI_NO_ERROR:
433 return (enum pnfs_osd_errno)0;
434
435 case OSD_ERR_PRI_CLEAR_PAGES:
436 BUG_ON(1);
437 return 0;
438
439 case OSD_ERR_PRI_RESOURCE:
440 return PNFS_OSD_ERR_RESOURCE;
441 case OSD_ERR_PRI_BAD_CRED:
442 return PNFS_OSD_ERR_BAD_CRED;
443 case OSD_ERR_PRI_NO_ACCESS:
444 return PNFS_OSD_ERR_NO_ACCESS;
445 case OSD_ERR_PRI_UNREACHABLE:
446 return PNFS_OSD_ERR_UNREACHABLE;
447 case OSD_ERR_PRI_NOT_FOUND:
448 return PNFS_OSD_ERR_NOT_FOUND;
449 case OSD_ERR_PRI_NO_SPACE:
450 return PNFS_OSD_ERR_NO_SPACE;
451 default:
452 WARN_ON(1);
453 /* fallthrough */
454 case OSD_ERR_PRI_EIO:
455 return PNFS_OSD_ERR_EIO;
456 }
457}
458
459static void _clear_bio(struct bio *bio)
460{
461 struct bio_vec *bv;
462 unsigned i;
463
464 __bio_for_each_segment(bv, bio, i, 0) {
465 unsigned this_count = bv->bv_len;
466
467 if (likely(PAGE_SIZE == this_count))
468 clear_highpage(bv->bv_page);
469 else
470 zero_user(bv->bv_page, bv->bv_offset, this_count);
471 }
472}
473
474static int _io_check(struct objio_state *ios, bool is_write)
475{
476 enum osd_err_priority oep = OSD_ERR_PRI_NO_ERROR;
477 int lin_ret = 0;
478 int i;
479
480 for (i = 0; i < ios->numdevs; i++) {
481 struct osd_sense_info osi;
482 struct osd_request *or = ios->per_dev[i].or;
483 unsigned dev;
484 int ret;
485
486 if (!or)
487 continue;
488
489 ret = osd_req_decode_sense(or, &osi);
490 if (likely(!ret))
491 continue;
492
493 if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
494 /* start read offset passed endof file */
495 BUG_ON(is_write);
496 _clear_bio(ios->per_dev[i].bio);
497 dprintk("%s: start read offset passed end of file "
498 "offset=0x%llx, length=0x%lx\n", __func__,
499 _LLU(ios->per_dev[i].offset),
500 ios->per_dev[i].length);
501
502 continue; /* we recovered */
503 }
504 dev = ios->per_dev[i].dev;
505 objlayout_io_set_result(&ios->ol_state, dev,
506 &ios->layout->comps[dev].oc_object_id,
507 osd_pri_2_pnfs_err(osi.osd_err_pri),
508 ios->per_dev[i].offset,
509 ios->per_dev[i].length,
510 is_write);
511
512 if (osi.osd_err_pri >= oep) {
513 oep = osi.osd_err_pri;
514 lin_ret = ret;
515 }
516 }
517
518 return lin_ret;
519}
520
521/*
522 * Common IO state helpers.
523 */
524static void _io_free(struct objio_state *ios)
525{
526 unsigned i;
527
528 for (i = 0; i < ios->numdevs; i++) {
529 struct _objio_per_comp *per_dev = &ios->per_dev[i];
530
531 if (per_dev->or) {
532 osd_end_request(per_dev->or);
533 per_dev->or = NULL;
534 }
535
536 if (per_dev->bio) {
537 bio_put(per_dev->bio);
538 per_dev->bio = NULL;
539 }
540 }
541}
542
543struct osd_dev *_io_od(struct objio_state *ios, unsigned dev)
544{
545 unsigned min_dev = ios->layout->comps_index;
546 unsigned max_dev = min_dev + ios->layout->num_comps;
547
548 BUG_ON(dev < min_dev || max_dev <= dev);
549 return ios->layout->ods[dev - min_dev]->od;
550}
551
/* Result of _calc_stripe_info() for one file offset. */
552struct _striping_info {
/* offset inside the object on device 'dev' */
553 u64 obj_offset;
/* bytes from the file offset to the end of its group; the caller may shrink it */
554 u64 group_length;
/* device number, already multiplied by mirrors_p1 */
555 unsigned dev;
/* file offset modulo stripe_unit */
556 unsigned unit_off;
557};
558
/*
 * Map @file_offset to a device and an object offset using the layout's
 * striping parameters, and report how many bytes remain in the group
 * containing that offset.  Results are written to @si.
 *
 * Names below, with L standing for file_offset:
 *   U - bytes in one stripe (stripe_unit * group_width)
 *   T - bytes in one group  (U * group_depth)
 *   S - bytes in all groups (T * group_count)
 *   M - number of whole S-sized runs before L
 *   G - group index within the current run
 *   H - byte offset within that group
 *   N - stripe index within that group
 */
559static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
560 struct _striping_info *si)
561{
562 u32 stripe_unit = ios->layout->stripe_unit;
563 u32 group_width = ios->layout->group_width;
564 u64 group_depth = ios->layout->group_depth;
565 u32 U = stripe_unit * group_width;
566
567 u64 T = U * group_depth;
568 u64 S = T * ios->layout->group_count;
569 u64 M = div64_u64(file_offset, S);
570
571 /*
572 G = (L - (M * S)) / T
573 H = (L - (M * S)) % T
574 */
/* despite its name this holds L - M * S, the offset within the current run */
575 u64 LmodU = file_offset - M * S;
576 u32 G = div64_u64(LmodU, T);
577 u64 H = LmodU - G * T;
578
579 u32 N = div_u64(H, U);
580
/* only the remainder is kept: unit_off = L % stripe_unit */
581 div_u64_rem(file_offset, stripe_unit, &si->unit_off);
582 si->obj_offset = si->unit_off + (N * stripe_unit) +
583 (M * group_depth * stripe_unit);
584
585 /* "H - (N * U)" is just "H % U" so it's bound to u32 */
586 si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width;
/* scale from a stripe-unit column to a device number */
587 si->dev *= ios->layout->mirrors_p1;
588
589 si->group_length = T - H;
590}
591
592static int _add_stripe_unit(struct objio_state *ios, unsigned *cur_pg,
593 unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
594 gfp_t gfp_flags)
595{
596 unsigned pg = *cur_pg;
597 struct request_queue *q =
598 osd_request_queue(_io_od(ios, per_dev->dev));
599
600 per_dev->length += cur_len;
601
602 if (per_dev->bio == NULL) {
603 unsigned stripes = ios->layout->num_comps /
604 ios->layout->mirrors_p1;
605 unsigned pages_in_stripe = stripes *
606 (ios->layout->stripe_unit / PAGE_SIZE);
607 unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
608 stripes;
609
610 if (BIO_MAX_PAGES_KMALLOC < bio_size)
611 bio_size = BIO_MAX_PAGES_KMALLOC;
612
613 per_dev->bio = bio_kmalloc(gfp_flags, bio_size);
614 if (unlikely(!per_dev->bio)) {
615 dprintk("Faild to allocate BIO size=%u\n", bio_size);
616 return -ENOMEM;
617 }
618 }
619
620 while (cur_len > 0) {
621 unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len);
622 unsigned added_len;
623
624 BUG_ON(ios->ol_state.nr_pages <= pg);
625 cur_len -= pglen;
626
627 added_len = bio_add_pc_page(q, per_dev->bio,
628 ios->ol_state.pages[pg], pglen, pgbase);
629 if (unlikely(pglen != added_len))
630 return -ENOMEM;
631 pgbase = 0;
632 ++pg;
633 }
634 BUG_ON(cur_len);
635
636 *cur_pg = pg;
637 return 0;
638}
639
/*
 * Distribute @length bytes over the devices of one group, starting at the
 * device and offsets described by @si and at page *@last_pg of the IO's
 * page array.
 *
 * Devices are visited in steps of mirrors_p1, wrapping inside the group
 * that contains si->dev.  Each visit adds at most one stripe unit to that
 * device through _add_stripe_unit().  ios->length grows by every chunk
 * that was added; ios->numdevs and *@last_pg are updated on exit, also
 * when an error is returned.  Returns 0 or the _add_stripe_unit() error.
 */
640static int _prepare_one_group(struct objio_state *ios, u64 length,
641 struct _striping_info *si, unsigned *last_pg,
642 gfp_t gfp_flags)
643{
644 unsigned stripe_unit = ios->layout->stripe_unit;
645 unsigned mirrors_p1 = ios->layout->mirrors_p1;
646 unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
647 unsigned dev = si->dev;
/* first device of the group that contains si->dev */
648 unsigned first_dev = dev - (dev % devs_in_group);
/* highest device index used so far, carried over from earlier calls via numdevs */
649 unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
650 unsigned cur_pg = *last_pg;
651 int ret = 0;
652
653 while (length) {
654 struct _objio_per_comp *per_dev = &ios->per_dev[dev];
655 unsigned cur_len, page_off = 0;
656
/* zero length means this device has not been used by this IO yet */
657 if (!per_dev->length) {
658 per_dev->dev = dev;
/* device before si->dev: its data starts at the next stripe-unit boundary */
659 if (dev < si->dev) {
660 per_dev->offset = si->obj_offset + stripe_unit -
661 si->unit_off;
662 cur_len = stripe_unit;
663 } else if (dev == si->dev) {
/* starting device: begin mid-unit, so only the rest of the unit is taken */
664 per_dev->offset = si->obj_offset;
665 cur_len = stripe_unit - si->unit_off;
666 page_off = si->unit_off & ~PAGE_MASK;
667 BUG_ON(page_off &&
668 (page_off != ios->ol_state.pgbase));
669 } else { /* dev > si->dev */
670 per_dev->offset = si->obj_offset - si->unit_off;
671 cur_len = stripe_unit;
672 }
673
674 if (max_comp < dev)
675 max_comp = dev;
676 } else {
677 cur_len = stripe_unit;
678 }
/* never take more than what is left of the request */
679 if (cur_len >= length)
680 cur_len = length;
681
682 ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev,
683 cur_len, gfp_flags);
684 if (unlikely(ret))
685 goto out;
686
/* step to the next device, wrapping inside the group */
687 dev += mirrors_p1;
688 dev = (dev % devs_in_group) + first_dev;
689
690 length -= cur_len;
691 ios->length += cur_len;
692 }
693out:
694 ios->numdevs = max_comp + mirrors_p1;
695 *last_pg = cur_pg;
696 return ret;
697}
698
699static int _io_rw_pagelist(struct objio_state *ios, gfp_t gfp_flags)
700{
701 u64 length = ios->ol_state.count;
702 u64 offset = ios->ol_state.offset;
703 struct _striping_info si;
704 unsigned last_pg = 0;
705 int ret = 0;
706
707 while (length) {
708 _calc_stripe_info(ios, offset, &si);
709
710 if (length < si.group_length)
711 si.group_length = length;
712
713 ret = _prepare_one_group(ios, si.group_length, &si, &last_pg, gfp_flags);
714 if (unlikely(ret))
715 goto out;
716
717 offset += si.group_length;
718 length -= si.group_length;
719 }
720
721out:
722 if (!ios->length)
723 return ret;
724
725 return 0;
726}
727
728static ssize_t _sync_done(struct objio_state *ios)
729{
730 struct completion *waiting = ios->private;
731
732 complete(waiting);
733 return 0;
734}
735
736static void _last_io(struct kref *kref)
737{
738 struct objio_state *ios = container_of(kref, struct objio_state, kref);
739
740 ios->done(ios);
741}
742
743static void _done_io(struct osd_request *or, void *p)
744{
745 struct objio_state *ios = p;
746
747 kref_put(&ios->kref, _last_io);
748}
749
750static ssize_t _io_exec(struct objio_state *ios)
751{
752 DECLARE_COMPLETION_ONSTACK(wait);
753 ssize_t status = 0; /* sync status */
754 unsigned i;
755 objio_done_fn saved_done_fn = ios->done;
756 bool sync = ios->ol_state.sync;
757
758 if (sync) {
759 ios->done = _sync_done;
760 ios->private = &wait;
761 }
762
763 kref_init(&ios->kref);
764
765 for (i = 0; i < ios->numdevs; i++) {
766 struct osd_request *or = ios->per_dev[i].or;
767
768 if (!or)
769 continue;
770
771 kref_get(&ios->kref);
772 osd_execute_request_async(or, _done_io, ios);
773 }
774
775 kref_put(&ios->kref, _last_io);
776
777 if (sync) {
778 wait_for_completion(&wait);
779 status = saved_done_fn(ios);
780 }
781
782 return status;
783}
784
785/*
786 * read
787 */
788static ssize_t _read_done(struct objio_state *ios)
789{
790 ssize_t status;
791 int ret = _io_check(ios, false);
792
793 _io_free(ios);
794
795 if (likely(!ret))
796 status = ios->length;
797 else
798 status = ret;
799
800 objlayout_read_done(&ios->ol_state, status, ios->ol_state.sync);
801 return status;
802}
803
804static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
805{
806 struct osd_request *or = NULL;
807 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
808 unsigned dev = per_dev->dev;
809 struct pnfs_osd_object_cred *cred =
810 &ios->layout->comps[dev];
811 struct osd_obj_id obj = {
812 .partition = cred->oc_object_id.oid_partition_id,
813 .id = cred->oc_object_id.oid_object_id,
814 };
815 int ret;
816
817 or = osd_start_request(_io_od(ios, dev), GFP_KERNEL);
818 if (unlikely(!or)) {
819 ret = -ENOMEM;
820 goto err;
821 }
822 per_dev->or = or;
823
824 osd_req_read(or, &obj, per_dev->offset, per_dev->bio, per_dev->length);
825
826 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
827 if (ret) {
828 dprintk("%s: Faild to osd_finalize_request() => %d\n",
829 __func__, ret);
830 goto err;
831 }
832
833 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
834 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
835 per_dev->length);
836
837err:
838 return ret;
839}
840
841static ssize_t _read_exec(struct objio_state *ios)
842{
843 unsigned i;
844 int ret;
845
846 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
847 if (!ios->per_dev[i].length)
848 continue;
849 ret = _read_mirrors(ios, i);
850 if (unlikely(ret))
851 goto err;
852 }
853
854 ios->done = _read_done;
855 return _io_exec(ios); /* In sync mode exec returns the io status */
856
857err:
858 _io_free(ios);
859 return ret;
860}
861
862ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state)
863{
864 struct objio_state *ios = container_of(ol_state, struct objio_state,
865 ol_state);
866 int ret;
867
868 ret = _io_rw_pagelist(ios, GFP_KERNEL);
869 if (unlikely(ret))
870 return ret;
871
872 return _read_exec(ios);
873}
874
875/*
876 * write
877 */
878static ssize_t _write_done(struct objio_state *ios)
879{
880 ssize_t status;
881 int ret = _io_check(ios, true);
882
883 _io_free(ios);
884
885 if (likely(!ret)) {
886 /* FIXME: should be based on the OSD's persistence model
887 * See OSD2r05 Section 4.13 Data persistence model */
888 ios->ol_state.committed = NFS_FILE_SYNC;
889 status = ios->length;
890 } else {
891 status = ret;
892 }
893
894 objlayout_write_done(&ios->ol_state, status, ios->ol_state.sync);
895 return status;
896}
897
898static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
899{
900 struct _objio_per_comp *master_dev = &ios->per_dev[cur_comp];
901 unsigned dev = ios->per_dev[cur_comp].dev;
902 unsigned last_comp = cur_comp + ios->layout->mirrors_p1;
903 int ret;
904
905 for (; cur_comp < last_comp; ++cur_comp, ++dev) {
906 struct osd_request *or = NULL;
907 struct pnfs_osd_object_cred *cred =
908 &ios->layout->comps[dev];
909 struct osd_obj_id obj = {
910 .partition = cred->oc_object_id.oid_partition_id,
911 .id = cred->oc_object_id.oid_object_id,
912 };
913 struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
914 struct bio *bio;
915
916 or = osd_start_request(_io_od(ios, dev), GFP_NOFS);
917 if (unlikely(!or)) {
918 ret = -ENOMEM;
919 goto err;
920 }
921 per_dev->or = or;
922
923 if (per_dev != master_dev) {
924 bio = bio_kmalloc(GFP_NOFS,
925 master_dev->bio->bi_max_vecs);
926 if (unlikely(!bio)) {
927 dprintk("Faild to allocate BIO size=%u\n",
928 master_dev->bio->bi_max_vecs);
929 ret = -ENOMEM;
930 goto err;
931 }
932
933 __bio_clone(bio, master_dev->bio);
934 bio->bi_bdev = NULL;
935 bio->bi_next = NULL;
936 per_dev->bio = bio;
937 per_dev->dev = dev;
938 per_dev->length = master_dev->length;
939 per_dev->offset = master_dev->offset;
940 } else {
941 bio = master_dev->bio;
942 bio->bi_rw |= REQ_WRITE;
943 }
944
945 osd_req_write(or, &obj, per_dev->offset, bio, per_dev->length);
946
947 ret = osd_finalize_request(or, 0, cred->oc_cap.cred, NULL);
948 if (ret) {
949 dprintk("%s: Faild to osd_finalize_request() => %d\n",
950 __func__, ret);
951 goto err;
952 }
953
954 dprintk("%s:[%d] dev=%d obj=0x%llx start=0x%llx length=0x%lx\n",
955 __func__, cur_comp, dev, obj.id, _LLU(per_dev->offset),
956 per_dev->length);
957 }
958
959err:
960 return ret;
961}
962
963static ssize_t _write_exec(struct objio_state *ios)
964{
965 unsigned i;
966 int ret;
967
968 for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) {
969 if (!ios->per_dev[i].length)
970 continue;
971 ret = _write_mirrors(ios, i);
972 if (unlikely(ret))
973 goto err;
974 }
975
976 ios->done = _write_done;
977 return _io_exec(ios); /* In sync mode exec returns the io->status */
978
979err:
980 _io_free(ios);
981 return ret;
982}
983
984ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state, bool stable)
985{
986 struct objio_state *ios = container_of(ol_state, struct objio_state,
987 ol_state);
988 int ret;
989
990 /* TODO: ios->stable = stable; */
991 ret = _io_rw_pagelist(ios, GFP_NOFS);
992 if (unlikely(ret))
993 return ret;
994
995 return _write_exec(ios);
996}
997
998static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
999 struct nfs_page *prev, struct nfs_page *req)
1000{
1001 if (!pnfs_generic_pg_test(pgio, prev, req))
1002 return false;
1003
1004 return pgio->pg_count + req->wb_bytes <=
1005 OBJIO_LSEG(pgio->pg_lseg)->max_io_size;
1006}
1007
/*
 * Layout driver descriptor, registered in objlayout_init() and
 * unregistered in objlayout_exit().  Apart from pg_test and
 * free_deviceid_node, which are defined in this file, the hooks are
 * objlayout_* functions that are not defined in this file.
 */
1008static struct pnfs_layoutdriver_type objlayout_type = {
1009 .id = LAYOUT_OSD2_OBJECTS,
1010 .name = "LAYOUT_OSD2_OBJECTS",
1011 .flags = PNFS_LAYOUTRET_ON_SETATTR,
1012
1013 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
1014 .free_layout_hdr = objlayout_free_layout_hdr,
1015
1016 .alloc_lseg = objlayout_alloc_lseg,
1017 .free_lseg = objlayout_free_lseg,
1018
1019 .read_pagelist = objlayout_read_pagelist,
1020 .write_pagelist = objlayout_write_pagelist,
/* limits a coalesced IO to the segment's max_io_size */
1021 .pg_test = objio_pg_test,
1022
/* puts the osd_dev and frees the objio_dev_ent */
1023 .free_deviceid_node = objio_free_deviceid_node,
1024
1025 .encode_layoutcommit = objlayout_encode_layoutcommit,
1026 .encode_layoutreturn = objlayout_encode_layoutreturn,
1027};
1028
1029MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects");
1030MODULE_AUTHOR("Benny Halevy <bhalevy@panasas.com>");
1031MODULE_LICENSE("GPL");
1032
1033static int __init
1034objlayout_init(void)
1035{
1036 int ret = pnfs_register_layoutdriver(&objlayout_type);
1037
1038 if (ret)
1039 printk(KERN_INFO
1040 "%s: Registering OSD pNFS Layout Driver failed: error=%d\n",
1041 __func__, ret);
1042 else
1043 printk(KERN_INFO "%s: Registered OSD pNFS Layout Driver\n",
1044 __func__);
1045 return ret;
1046}
1047
1048static void __exit
1049objlayout_exit(void)
1050{
1051 pnfs_unregister_layoutdriver(&objlayout_type);
1052 printk(KERN_INFO "%s: Unregistered OSD pNFS Layout Driver\n",
1053 __func__);
1054}
1055
1056module_init(objlayout_init);
1057module_exit(objlayout_exit);
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
new file mode 100644
index 000000000000..dc3956c0de80
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.c
@@ -0,0 +1,712 @@
1/*
2 * pNFS Objects layout driver high level definitions
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <scsi/osd_initiator.h>
41#include "objlayout.h"
42
43#define NFSDBG_FACILITY NFSDBG_PNFS_LD
44/*
45 * Create a objlayout layout structure for the given inode and return it.
46 */
47struct pnfs_layout_hdr *
48objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
49{
50 struct objlayout *objlay;
51
52 objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
53 if (objlay) {
54 spin_lock_init(&objlay->lock);
55 INIT_LIST_HEAD(&objlay->err_list);
56 }
57 dprintk("%s: Return %p\n", __func__, objlay);
58 return &objlay->pnfs_layout;
59}
60
61/*
62 * Free an objlayout layout structure
63 */
64void
65objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
66{
67 struct objlayout *objlay = OBJLAYOUT(lo);
68
69 dprintk("%s: objlay %p\n", __func__, objlay);
70
71 WARN_ON(!list_empty(&objlay->err_list));
72 kfree(objlay);
73}
74
75/*
76 * Unmarshall layout and store it in pnfslay.
77 */
78struct pnfs_layout_segment *
79objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay,
80 struct nfs4_layoutget_res *lgr,
81 gfp_t gfp_flags)
82{
83 int status = -ENOMEM;
84 struct xdr_stream stream;
85 struct xdr_buf buf = {
86 .pages = lgr->layoutp->pages,
87 .page_len = lgr->layoutp->len,
88 .buflen = lgr->layoutp->len,
89 .len = lgr->layoutp->len,
90 };
91 struct page *scratch;
92 struct pnfs_layout_segment *lseg;
93
94 dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay);
95
96 scratch = alloc_page(gfp_flags);
97 if (!scratch)
98 goto err_nofree;
99
100 xdr_init_decode(&stream, &buf, NULL);
101 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
102
103 status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags);
104 if (unlikely(status)) {
105 dprintk("%s: objio_alloc_lseg Return err %d\n", __func__,
106 status);
107 goto err;
108 }
109
110 __free_page(scratch);
111
112 dprintk("%s: Return %p\n", __func__, lseg);
113 return lseg;
114
115err:
116 __free_page(scratch);
117err_nofree:
118 dprintk("%s: Err Return=>%d\n", __func__, status);
119 return ERR_PTR(status);
120}
121
/*
 * Free a layout segment. A NULL @lseg is tolerated and ignored.
 */
void
objlayout_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s: freeing layout segment %p\n", __func__, lseg);

	if (likely(lseg))
		objio_free_lseg(lseg);
}
135
136/*
137 * I/O Operations
138 */
139static inline u64
140end_offset(u64 start, u64 len)
141{
142 u64 end;
143
144 end = start + len;
145 return end >= start ? end : NFS4_MAX_UINT64;
146}
147
148/* last octet in a range */
149static inline u64
150last_byte_offset(u64 start, u64 len)
151{
152 u64 end;
153
154 BUG_ON(!len);
155 end = start + len;
156 return end > start ? end - 1 : NFS4_MAX_UINT64;
157}
158
159static struct objlayout_io_state *
160objlayout_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type,
161 struct page **pages,
162 unsigned pgbase,
163 loff_t offset,
164 size_t count,
165 struct pnfs_layout_segment *lseg,
166 void *rpcdata,
167 gfp_t gfp_flags)
168{
169 struct objlayout_io_state *state;
170 u64 lseg_end_offset;
171
172 dprintk("%s: allocating io_state\n", __func__);
173 if (objio_alloc_io_state(lseg, &state, gfp_flags))
174 return NULL;
175
176 BUG_ON(offset < lseg->pls_range.offset);
177 lseg_end_offset = end_offset(lseg->pls_range.offset,
178 lseg->pls_range.length);
179 BUG_ON(offset >= lseg_end_offset);
180 if (offset + count > lseg_end_offset) {
181 count = lseg->pls_range.length -
182 (offset - lseg->pls_range.offset);
183 dprintk("%s: truncated count %Zd\n", __func__, count);
184 }
185
186 if (pgbase > PAGE_SIZE) {
187 pages += pgbase >> PAGE_SHIFT;
188 pgbase &= ~PAGE_MASK;
189 }
190
191 INIT_LIST_HEAD(&state->err_list);
192 state->lseg = lseg;
193 state->rpcdata = rpcdata;
194 state->pages = pages;
195 state->pgbase = pgbase;
196 state->nr_pages = (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT;
197 state->offset = offset;
198 state->count = count;
199 state->sync = 0;
200
201 return state;
202}
203
/* Release an io_state through objio_free_io_state(); NULL is ignored. */
static void
objlayout_free_io_state(struct objlayout_io_state *state)
{
	dprintk("%s: freeing io_state\n", __func__);

	if (likely(state))
		objio_free_io_state(state);
}
213
214/*
215 * I/O done common code
216 */
217static void
218objlayout_iodone(struct objlayout_io_state *state)
219{
220 dprintk("%s: state %p status\n", __func__, state);
221
222 if (likely(state->status >= 0)) {
223 objlayout_free_io_state(state);
224 } else {
225 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
226
227 spin_lock(&objlay->lock);
228 objlay->delta_space_valid = OBJ_DSU_INVALID;
229 list_add(&objlay->err_list, &state->err_list);
230 spin_unlock(&objlay->lock);
231 }
232}
233
234/*
235 * objlayout_io_set_result - Set an osd_error code on a specific osd comp.
236 *
237 * The @index component IO failed (error returned from target). Register
238 * the error for later reporting at layout-return.
239 */
240void
241objlayout_io_set_result(struct objlayout_io_state *state, unsigned index,
242 struct pnfs_osd_objid *pooid, int osd_error,
243 u64 offset, u64 length, bool is_write)
244{
245 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[index];
246
247 BUG_ON(index >= state->num_comps);
248 if (osd_error) {
249 ioerr->oer_component = *pooid;
250 ioerr->oer_comp_offset = offset;
251 ioerr->oer_comp_length = length;
252 ioerr->oer_iswrite = is_write;
253 ioerr->oer_errno = osd_error;
254
255 dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) "
256 "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n",
257 __func__, index, ioerr->oer_errno,
258 ioerr->oer_iswrite,
259 _DEVID_LO(&ioerr->oer_component.oid_device_id),
260 _DEVID_HI(&ioerr->oer_component.oid_device_id),
261 ioerr->oer_component.oid_partition_id,
262 ioerr->oer_component.oid_object_id,
263 ioerr->oer_comp_offset,
264 ioerr->oer_comp_length);
265 } else {
266 /* User need not call if no error is reported */
267 ioerr->oer_errno = 0;
268 }
269}
270
271/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete().
272 * This is because the osd completion is called with ints-off from
273 * the block layer
274 */
275static void _rpc_read_complete(struct work_struct *work)
276{
277 struct rpc_task *task;
278 struct nfs_read_data *rdata;
279
280 dprintk("%s enter\n", __func__);
281 task = container_of(work, struct rpc_task, u.tk_work);
282 rdata = container_of(task, struct nfs_read_data, task);
283
284 pnfs_ld_read_done(rdata);
285}
286
287void
288objlayout_read_done(struct objlayout_io_state *state, ssize_t status, bool sync)
289{
290 int eof = state->eof;
291 struct nfs_read_data *rdata;
292
293 state->status = status;
294 dprintk("%s: Begin status=%ld eof=%d\n", __func__, status, eof);
295 rdata = state->rpcdata;
296 rdata->task.tk_status = status;
297 if (status >= 0) {
298 rdata->res.count = status;
299 rdata->res.eof = eof;
300 }
301 objlayout_iodone(state);
302 /* must not use state after this point */
303
304 if (sync)
305 pnfs_ld_read_done(rdata);
306 else {
307 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete);
308 schedule_work(&rdata->task.u.tk_work);
309 }
310}
311
312/*
313 * Perform sync or async reads.
314 */
315enum pnfs_try_status
316objlayout_read_pagelist(struct nfs_read_data *rdata)
317{
318 loff_t offset = rdata->args.offset;
319 size_t count = rdata->args.count;
320 struct objlayout_io_state *state;
321 ssize_t status = 0;
322 loff_t eof;
323
324 dprintk("%s: Begin inode %p offset %llu count %d\n",
325 __func__, rdata->inode, offset, (int)count);
326
327 eof = i_size_read(rdata->inode);
328 if (unlikely(offset + count > eof)) {
329 if (offset >= eof) {
330 status = 0;
331 rdata->res.count = 0;
332 rdata->res.eof = 1;
333 goto out;
334 }
335 count = eof - offset;
336 }
337
338 state = objlayout_alloc_io_state(NFS_I(rdata->inode)->layout,
339 rdata->args.pages, rdata->args.pgbase,
340 offset, count,
341 rdata->lseg, rdata,
342 GFP_KERNEL);
343 if (unlikely(!state)) {
344 status = -ENOMEM;
345 goto out;
346 }
347
348 state->eof = state->offset + state->count >= eof;
349
350 status = objio_read_pagelist(state);
351 out:
352 dprintk("%s: Return status %Zd\n", __func__, status);
353 rdata->pnfs_error = status;
354 return PNFS_ATTEMPTED;
355}
356
357/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete().
358 * This is because the osd completion is called with ints-off from
359 * the block layer
360 */
361static void _rpc_write_complete(struct work_struct *work)
362{
363 struct rpc_task *task;
364 struct nfs_write_data *wdata;
365
366 dprintk("%s enter\n", __func__);
367 task = container_of(work, struct rpc_task, u.tk_work);
368 wdata = container_of(task, struct nfs_write_data, task);
369
370 pnfs_ld_write_done(wdata);
371}
372
373void
374objlayout_write_done(struct objlayout_io_state *state, ssize_t status,
375 bool sync)
376{
377 struct nfs_write_data *wdata;
378
379 dprintk("%s: Begin\n", __func__);
380 wdata = state->rpcdata;
381 state->status = status;
382 wdata->task.tk_status = status;
383 if (status >= 0) {
384 wdata->res.count = status;
385 wdata->verf.committed = state->committed;
386 dprintk("%s: Return status %d committed %d\n",
387 __func__, wdata->task.tk_status,
388 wdata->verf.committed);
389 } else
390 dprintk("%s: Return status %d\n",
391 __func__, wdata->task.tk_status);
392 objlayout_iodone(state);
393 /* must not use state after this point */
394
395 if (sync)
396 pnfs_ld_write_done(wdata);
397 else {
398 INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete);
399 schedule_work(&wdata->task.u.tk_work);
400 }
401}
402
403/*
404 * Perform sync or async writes.
405 */
406enum pnfs_try_status
407objlayout_write_pagelist(struct nfs_write_data *wdata,
408 int how)
409{
410 struct objlayout_io_state *state;
411 ssize_t status;
412
413 dprintk("%s: Begin inode %p offset %llu count %u\n",
414 __func__, wdata->inode, wdata->args.offset, wdata->args.count);
415
416 state = objlayout_alloc_io_state(NFS_I(wdata->inode)->layout,
417 wdata->args.pages,
418 wdata->args.pgbase,
419 wdata->args.offset,
420 wdata->args.count,
421 wdata->lseg, wdata,
422 GFP_NOFS);
423 if (unlikely(!state)) {
424 status = -ENOMEM;
425 goto out;
426 }
427
428 state->sync = how & FLUSH_SYNC;
429
430 status = objio_write_pagelist(state, how & FLUSH_STABLE);
431 out:
432 dprintk("%s: Return status %Zd\n", __func__, status);
433 wdata->pnfs_error = status;
434 return PNFS_ATTEMPTED;
435}
436
437void
438objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay,
439 struct xdr_stream *xdr,
440 const struct nfs4_layoutcommit_args *args)
441{
442 struct objlayout *objlay = OBJLAYOUT(pnfslay);
443 struct pnfs_osd_layoutupdate lou;
444 __be32 *start;
445
446 dprintk("%s: Begin\n", __func__);
447
448 spin_lock(&objlay->lock);
449 lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID);
450 lou.dsu_delta = objlay->delta_space_used;
451 objlay->delta_space_used = 0;
452 objlay->delta_space_valid = OBJ_DSU_INIT;
453 lou.olu_ioerr_flag = !list_empty(&objlay->err_list);
454 spin_unlock(&objlay->lock);
455
456 start = xdr_reserve_space(xdr, 4);
457
458 BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou));
459
460 *start = cpu_to_be32((xdr->p - start - 1) * 4);
461
462 dprintk("%s: Return delta_space_used %lld err %d\n", __func__,
463 lou.dsu_delta, lou.olu_ioerr_flag);
464}
465
466static int
467err_prio(u32 oer_errno)
468{
469 switch (oer_errno) {
470 case 0:
471 return 0;
472
473 case PNFS_OSD_ERR_RESOURCE:
474 return OSD_ERR_PRI_RESOURCE;
475 case PNFS_OSD_ERR_BAD_CRED:
476 return OSD_ERR_PRI_BAD_CRED;
477 case PNFS_OSD_ERR_NO_ACCESS:
478 return OSD_ERR_PRI_NO_ACCESS;
479 case PNFS_OSD_ERR_UNREACHABLE:
480 return OSD_ERR_PRI_UNREACHABLE;
481 case PNFS_OSD_ERR_NOT_FOUND:
482 return OSD_ERR_PRI_NOT_FOUND;
483 case PNFS_OSD_ERR_NO_SPACE:
484 return OSD_ERR_PRI_NO_SPACE;
485 default:
486 WARN_ON(1);
487 /* fallthrough */
488 case PNFS_OSD_ERR_EIO:
489 return OSD_ERR_PRI_EIO;
490 }
491}
492
493static void
494merge_ioerr(struct pnfs_osd_ioerr *dest_err,
495 const struct pnfs_osd_ioerr *src_err)
496{
497 u64 dest_end, src_end;
498
499 if (!dest_err->oer_errno) {
500 *dest_err = *src_err;
501 /* accumulated device must be blank */
502 memset(&dest_err->oer_component.oid_device_id, 0,
503 sizeof(dest_err->oer_component.oid_device_id));
504
505 return;
506 }
507
508 if (dest_err->oer_component.oid_partition_id !=
509 src_err->oer_component.oid_partition_id)
510 dest_err->oer_component.oid_partition_id = 0;
511
512 if (dest_err->oer_component.oid_object_id !=
513 src_err->oer_component.oid_object_id)
514 dest_err->oer_component.oid_object_id = 0;
515
516 if (dest_err->oer_comp_offset > src_err->oer_comp_offset)
517 dest_err->oer_comp_offset = src_err->oer_comp_offset;
518
519 dest_end = end_offset(dest_err->oer_comp_offset,
520 dest_err->oer_comp_length);
521 src_end = end_offset(src_err->oer_comp_offset,
522 src_err->oer_comp_length);
523 if (dest_end < src_end)
524 dest_end = src_end;
525
526 dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset;
527
528 if ((src_err->oer_iswrite == dest_err->oer_iswrite) &&
529 (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) {
530 dest_err->oer_errno = src_err->oer_errno;
531 } else if (src_err->oer_iswrite) {
532 dest_err->oer_iswrite = true;
533 dest_err->oer_errno = src_err->oer_errno;
534 }
535}
536
537static void
538encode_accumulated_error(struct objlayout *objlay, __be32 *p)
539{
540 struct objlayout_io_state *state, *tmp;
541 struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0};
542
543 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
544 unsigned i;
545
546 for (i = 0; i < state->num_comps; i++) {
547 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
548
549 if (!ioerr->oer_errno)
550 continue;
551
552 printk(KERN_ERR "%s: err[%d]: errno=%d is_write=%d "
553 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
554 "offset=0x%llx length=0x%llx\n",
555 __func__, i, ioerr->oer_errno,
556 ioerr->oer_iswrite,
557 _DEVID_LO(&ioerr->oer_component.oid_device_id),
558 _DEVID_HI(&ioerr->oer_component.oid_device_id),
559 ioerr->oer_component.oid_partition_id,
560 ioerr->oer_component.oid_object_id,
561 ioerr->oer_comp_offset,
562 ioerr->oer_comp_length);
563
564 merge_ioerr(&accumulated_err, ioerr);
565 }
566 list_del(&state->err_list);
567 objlayout_free_io_state(state);
568 }
569
570 pnfs_osd_xdr_encode_ioerr(p, &accumulated_err);
571}
572
573void
574objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay,
575 struct xdr_stream *xdr,
576 const struct nfs4_layoutreturn_args *args)
577{
578 struct objlayout *objlay = OBJLAYOUT(pnfslay);
579 struct objlayout_io_state *state, *tmp;
580 __be32 *start;
581
582 dprintk("%s: Begin\n", __func__);
583 start = xdr_reserve_space(xdr, 4);
584 BUG_ON(!start);
585
586 spin_lock(&objlay->lock);
587
588 list_for_each_entry_safe(state, tmp, &objlay->err_list, err_list) {
589 __be32 *last_xdr = NULL, *p;
590 unsigned i;
591 int res = 0;
592
593 for (i = 0; i < state->num_comps; i++) {
594 struct pnfs_osd_ioerr *ioerr = &state->ioerrs[i];
595
596 if (!ioerr->oer_errno)
597 continue;
598
599 dprintk("%s: err[%d]: errno=%d is_write=%d "
600 "dev(%llx:%llx) par=0x%llx obj=0x%llx "
601 "offset=0x%llx length=0x%llx\n",
602 __func__, i, ioerr->oer_errno,
603 ioerr->oer_iswrite,
604 _DEVID_LO(&ioerr->oer_component.oid_device_id),
605 _DEVID_HI(&ioerr->oer_component.oid_device_id),
606 ioerr->oer_component.oid_partition_id,
607 ioerr->oer_component.oid_object_id,
608 ioerr->oer_comp_offset,
609 ioerr->oer_comp_length);
610
611 p = pnfs_osd_xdr_ioerr_reserve_space(xdr);
612 if (unlikely(!p)) {
613 res = -E2BIG;
614 break; /* accumulated_error */
615 }
616
617 last_xdr = p;
618 pnfs_osd_xdr_encode_ioerr(p, &state->ioerrs[i]);
619 }
620
621 /* TODO: use xdr_write_pages */
622 if (unlikely(res)) {
623 /* no space for even one error descriptor */
624 BUG_ON(!last_xdr);
625
626 /* we've encountered a situation with lots and lots of
627 * errors and no space to encode them all. Use the last
628 * available slot to report the union of all the
629 * remaining errors.
630 */
631 encode_accumulated_error(objlay, last_xdr);
632 goto loop_done;
633 }
634 list_del(&state->err_list);
635 objlayout_free_io_state(state);
636 }
637loop_done:
638 spin_unlock(&objlay->lock);
639
640 *start = cpu_to_be32((xdr->p - start - 1) * 4);
641 dprintk("%s: Return\n", __func__);
642}
643
644
645/*
646 * Get Device Info API for io engines
647 */
648struct objlayout_deviceinfo {
649 struct page *page;
650 struct pnfs_osd_deviceaddr da; /* This must be last */
651};
652
653/* Initialize and call nfs_getdeviceinfo, then decode and return a
654 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
655 * should be called.
656 */
657int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
658 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
659 gfp_t gfp_flags)
660{
661 struct objlayout_deviceinfo *odi;
662 struct pnfs_device pd;
663 struct super_block *sb;
664 struct page *page, **pages;
665 u32 *p;
666 int err;
667
668 page = alloc_page(gfp_flags);
669 if (!page)
670 return -ENOMEM;
671
672 pages = &page;
673 pd.pages = pages;
674
675 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
676 pd.layout_type = LAYOUT_OSD2_OBJECTS;
677 pd.pages = &page;
678 pd.pgbase = 0;
679 pd.pglen = PAGE_SIZE;
680 pd.mincount = 0;
681
682 sb = pnfslay->plh_inode->i_sb;
683 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd);
684 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
685 if (err)
686 goto err_out;
687
688 p = page_address(page);
689 odi = kzalloc(sizeof(*odi), gfp_flags);
690 if (!odi) {
691 err = -ENOMEM;
692 goto err_out;
693 }
694 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
695 odi->page = page;
696 *deviceaddr = &odi->da;
697 return 0;
698
699err_out:
700 __free_page(page);
701 return err;
702}
703
704void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
705{
706 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
707 struct objlayout_deviceinfo,
708 da);
709
710 __free_page(odi->page);
711 kfree(odi);
712}
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
new file mode 100644
index 000000000000..a8244c8e042d
--- /dev/null
+++ b/fs/nfs/objlayout/objlayout.h
@@ -0,0 +1,187 @@
1/*
2 * Data types and function declarations for interfacing with the
3 * pNFS standard object layout driver.
4 *
5 * Copyright (C) 2007 Panasas Inc. [year of first publication]
6 * All rights reserved.
7 *
8 * Benny Halevy <bhalevy@panasas.com>
9 * Boaz Harrosh <bharrosh@panasas.com>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2
13 * See the file COPYING included with this distribution for more details.
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 *
19 * 1. Redistributions of source code must retain the above copyright
20 * notice, this list of conditions and the following disclaimer.
21 * 2. Redistributions in binary form must reproduce the above copyright
22 * notice, this list of conditions and the following disclaimer in the
23 * documentation and/or other materials provided with the distribution.
24 * 3. Neither the name of the Panasas company nor the names of its
25 * contributors may be used to endorse or promote products derived
26 * from this software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
29 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
30 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
31 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
35 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
36 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
37 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
38 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39 */
40
41#ifndef _OBJLAYOUT_H
42#define _OBJLAYOUT_H
43
44#include <linux/nfs_fs.h>
45#include <linux/pnfs_osd_xdr.h>
46#include "../pnfs.h"
47
48/*
49 * per-inode layout
50 */
51struct objlayout {
52 struct pnfs_layout_hdr pnfs_layout;
53
54 /* for layout_commit */
55 enum osd_delta_space_valid_enum {
56 OBJ_DSU_INIT = 0,
57 OBJ_DSU_VALID,
58 OBJ_DSU_INVALID,
59 } delta_space_valid;
60 s64 delta_space_used; /* consumed by write ops */
61
62 /* for layout_return */
63 spinlock_t lock;
64 struct list_head err_list;
65};
66
67static inline struct objlayout *
68OBJLAYOUT(struct pnfs_layout_hdr *lo)
69{
70 return container_of(lo, struct objlayout, pnfs_layout);
71}
72
73/*
74 * per-I/O operation state
75 * embedded in objects provider io_state data structure
76 */
77struct objlayout_io_state {
78 struct pnfs_layout_segment *lseg;
79
80 struct page **pages;
81 unsigned pgbase;
82 unsigned nr_pages;
83 unsigned long count;
84 loff_t offset;
85 bool sync;
86
87 void *rpcdata;
88 int status; /* res */
89 int eof; /* res */
90 int committed; /* res */
91
92 /* Error reporting (layout_return) */
93 struct list_head err_list;
94 unsigned num_comps;
95 /* Pointer to array of error descriptors of size num_comps.
96 * It should contain as many entries as devices in the osd_layout
97 * that participate in the I/O. It is up to the io_engine to allocate
98 * needed space and set num_comps.
99 */
100 struct pnfs_osd_ioerr *ioerrs;
101};
102
103/*
104 * Raid engine I/O API
105 */
106extern int objio_alloc_lseg(struct pnfs_layout_segment **outp,
107 struct pnfs_layout_hdr *pnfslay,
108 struct pnfs_layout_range *range,
109 struct xdr_stream *xdr,
110 gfp_t gfp_flags);
111extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
112
113extern int objio_alloc_io_state(
114 struct pnfs_layout_segment *lseg,
115 struct objlayout_io_state **outp,
116 gfp_t gfp_flags);
117extern void objio_free_io_state(struct objlayout_io_state *state);
118
119extern ssize_t objio_read_pagelist(struct objlayout_io_state *ol_state);
120extern ssize_t objio_write_pagelist(struct objlayout_io_state *ol_state,
121 bool stable);
122
123/*
124 * callback API
125 */
126extern void objlayout_io_set_result(struct objlayout_io_state *state,
127 unsigned index, struct pnfs_osd_objid *pooid,
128 int osd_error, u64 offset, u64 length, bool is_write);
129
130static inline void
131objlayout_add_delta_space_used(struct objlayout_io_state *state, s64 space_used)
132{
133 struct objlayout *objlay = OBJLAYOUT(state->lseg->pls_layout);
134
135 /* If one of the I/Os errored out and the delta_space_used was
136 * invalid we render the complete report as invalid. Protocol mandate
137 * the DSU be accurate or not reported.
138 */
139 spin_lock(&objlay->lock);
140 if (objlay->delta_space_valid != OBJ_DSU_INVALID) {
141 objlay->delta_space_valid = OBJ_DSU_VALID;
142 objlay->delta_space_used += space_used;
143 }
144 spin_unlock(&objlay->lock);
145}
146
147extern void objlayout_read_done(struct objlayout_io_state *state,
148 ssize_t status, bool sync);
149extern void objlayout_write_done(struct objlayout_io_state *state,
150 ssize_t status, bool sync);
151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/*
158 * exported generic objects function vectors
159 */
160
161extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags);
162extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *);
163
164extern struct pnfs_layout_segment *objlayout_alloc_lseg(
165 struct pnfs_layout_hdr *,
166 struct nfs4_layoutget_res *,
167 gfp_t gfp_flags);
168extern void objlayout_free_lseg(struct pnfs_layout_segment *);
169
170extern enum pnfs_try_status objlayout_read_pagelist(
171 struct nfs_read_data *);
172
173extern enum pnfs_try_status objlayout_write_pagelist(
174 struct nfs_write_data *,
175 int how);
176
177extern void objlayout_encode_layoutcommit(
178 struct pnfs_layout_hdr *,
179 struct xdr_stream *,
180 const struct nfs4_layoutcommit_args *);
181
182extern void objlayout_encode_layoutreturn(
183 struct pnfs_layout_hdr *,
184 struct xdr_stream *,
185 const struct nfs4_layoutreturn_args *);
186
187#endif /* _OBJLAYOUT_H */
diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
new file mode 100644
index 000000000000..16fc758e9123
--- /dev/null
+++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c
@@ -0,0 +1,412 @@
1/*
2 * Object-Based pNFS Layout XDR layer
3 *
4 * Copyright (C) 2007 Panasas Inc. [year of first publication]
5 * All rights reserved.
6 *
7 * Benny Halevy <bhalevy@panasas.com>
8 * Boaz Harrosh <bharrosh@panasas.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2
12 * See the file COPYING included with this distribution for more details.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 *
18 * 1. Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * 2. Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in the
22 * documentation and/or other materials provided with the distribution.
23 * 3. Neither the name of the Panasas company nor the names of its
24 * contributors may be used to endorse or promote products derived
25 * from this software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
28 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
29 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
30 * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
34 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 */
39
40#include <linux/pnfs_osd_xdr.h>
41
42#define NFSDBG_FACILITY NFSDBG_PNFS_LD
43
44/*
45 * The following implementation is based on RFC5664
46 */
47
48/*
49 * struct pnfs_osd_objid {
50 * struct nfs4_deviceid oid_device_id;
51 * u64 oid_partition_id;
52 * u64 oid_object_id;
53 * }; // xdr size 32 bytes
54 */
55static __be32 *
56_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid)
57{
58 p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data,
59 sizeof(objid->oid_device_id.data));
60
61 p = xdr_decode_hyper(p, &objid->oid_partition_id);
62 p = xdr_decode_hyper(p, &objid->oid_object_id);
63 return p;
64}
65/*
66 * struct pnfs_osd_opaque_cred {
67 * u32 cred_len;
68 * void *cred;
69 * }; // xdr size [variable]
70 * The return pointers are from the xdr buffer
71 */
72static int
73_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred,
74 struct xdr_stream *xdr)
75{
76 __be32 *p = xdr_inline_decode(xdr, 1);
77
78 if (!p)
79 return -EINVAL;
80
81 opaque_cred->cred_len = be32_to_cpu(*p++);
82
83 p = xdr_inline_decode(xdr, opaque_cred->cred_len);
84 if (!p)
85 return -EINVAL;
86
87 opaque_cred->cred = p;
88 return 0;
89}
90
91/*
92 * struct pnfs_osd_object_cred {
93 * struct pnfs_osd_objid oc_object_id;
94 * u32 oc_osd_version;
95 * u32 oc_cap_key_sec;
96 * struct pnfs_osd_opaque_cred oc_cap_key
97 * struct pnfs_osd_opaque_cred oc_cap;
98 * }; // xdr size 32 + 4 + 4 + [variable] + [variable]
99 */
100static int
101_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp,
102 struct xdr_stream *xdr)
103{
104 __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4);
105 int ret;
106
107 if (!p)
108 return -EIO;
109
110 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
111 comp->oc_osd_version = be32_to_cpup(p++);
112 comp->oc_cap_key_sec = be32_to_cpup(p);
113
114 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr);
115 if (unlikely(ret))
116 return ret;
117
118 ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr);
119 return ret;
120}
121
/*
 * struct pnfs_osd_data_map {
 *	u32	odm_num_comps;
 *	u64	odm_stripe_unit;
 *	u32	odm_group_width;
 *	u32	odm_group_depth;
 *	u32	odm_mirror_cnt;
 *	u32	odm_raid_algorithm;
 * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4
 *
 * Returns the XDR-encoded size of the structure above, in bytes.
 */
static inline int
_osd_data_map_xdr_sz(void)
{
	int sz = 0;

	sz += 4;	/* odm_num_comps */
	sz += 8;	/* odm_stripe_unit */
	sz += 4;	/* odm_group_width */
	sz += 4;	/* odm_group_depth */
	sz += 4;	/* odm_mirror_cnt */
	sz += 4;	/* odm_raid_algorithm */

	return sz;
}
137
138static __be32 *
139_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map)
140{
141 data_map->odm_num_comps = be32_to_cpup(p++);
142 p = xdr_decode_hyper(p, &data_map->odm_stripe_unit);
143 data_map->odm_group_width = be32_to_cpup(p++);
144 data_map->odm_group_depth = be32_to_cpup(p++);
145 data_map->odm_mirror_cnt = be32_to_cpup(p++);
146 data_map->odm_raid_algorithm = be32_to_cpup(p++);
147 dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u "
148 "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n",
149 __func__,
150 data_map->odm_num_comps,
151 (unsigned long long)data_map->odm_stripe_unit,
152 data_map->odm_group_width,
153 data_map->odm_group_depth,
154 data_map->odm_mirror_cnt,
155 data_map->odm_raid_algorithm);
156 return p;
157}
158
159int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
160 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr)
161{
162 __be32 *p;
163
164 memset(iter, 0, sizeof(*iter));
165
166 p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4);
167 if (unlikely(!p))
168 return -EINVAL;
169
170 p = _osd_xdr_decode_data_map(p, &layout->olo_map);
171 layout->olo_comps_index = be32_to_cpup(p++);
172 layout->olo_num_comps = be32_to_cpup(p++);
173 iter->total_comps = layout->olo_num_comps;
174 return 0;
175}
176
177bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp,
178 struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr,
179 int *err)
180{
181 BUG_ON(iter->decoded_comps > iter->total_comps);
182 if (iter->decoded_comps == iter->total_comps)
183 return false;
184
185 *err = _osd_xdr_decode_object_cred(comp, xdr);
186 if (unlikely(*err)) {
187 dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d "
188 "total_comps=%d\n", __func__, *err,
189 iter->decoded_comps, iter->total_comps);
190 return false; /* stop the loop */
191 }
192 dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx "
193 "key_len=%u cap_len=%u\n",
194 __func__,
195 _DEVID_LO(&comp->oc_object_id.oid_device_id),
196 _DEVID_HI(&comp->oc_object_id.oid_device_id),
197 comp->oc_object_id.oid_partition_id,
198 comp->oc_object_id.oid_object_id,
199 comp->oc_cap_key.cred_len, comp->oc_cap.cred_len);
200
201 iter->decoded_comps++;
202 return true;
203}
204
205/*
206 * Get Device Information Decoding
207 *
208 * Note: since Device Information is currently done synchronously, all
209 * variable strings fields are left inside the rpc buffer and are only
210 * pointed to by the pnfs_osd_deviceaddr members. So the read buffer
211 * should not be freed while the returned information is in use.
212 */
213/*
214 *struct nfs4_string {
215 * unsigned int len;
216 * char *data;
217 *}; // size [variable]
218 * NOTE: Returned string points to inside the XDR buffer
219 */
220static __be32 *
221__read_u8_opaque(__be32 *p, struct nfs4_string *str)
222{
223 str->len = be32_to_cpup(p++);
224 str->data = (char *)p;
225
226 p += XDR_QUADLEN(str->len);
227 return p;
228}
229
230/*
231 * struct pnfs_osd_targetid {
232 * u32 oti_type;
233 * struct nfs4_string oti_scsi_device_id;
234 * };// size 4 + [variable]
235 */
236static __be32 *
237__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid)
238{
239 u32 oti_type;
240
241 oti_type = be32_to_cpup(p++);
242 targetid->oti_type = oti_type;
243
244 switch (oti_type) {
245 case OBJ_TARGET_SCSI_NAME:
246 case OBJ_TARGET_SCSI_DEVICE_ID:
247 p = __read_u8_opaque(p, &targetid->oti_scsi_device_id);
248 }
249
250 return p;
251}
252
253/*
254 * struct pnfs_osd_net_addr {
255 * struct nfs4_string r_netid;
256 * struct nfs4_string r_addr;
257 * };
258 */
259static __be32 *
260__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr)
261{
262 p = __read_u8_opaque(p, &netaddr->r_netid);
263 p = __read_u8_opaque(p, &netaddr->r_addr);
264
265 return p;
266}
267
268/*
269 * struct pnfs_osd_targetaddr {
270 * u32 ota_available;
271 * struct pnfs_osd_net_addr ota_netaddr;
272 * };
273 */
274static __be32 *
275__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr)
276{
277 u32 ota_available;
278
279 ota_available = be32_to_cpup(p++);
280 targetaddr->ota_available = ota_available;
281
282 if (ota_available)
283 p = __read_net_addr(p, &targetaddr->ota_netaddr);
284
285
286 return p;
287}
288
289/*
290 * struct pnfs_osd_deviceaddr {
291 * struct pnfs_osd_targetid oda_targetid;
292 * struct pnfs_osd_targetaddr oda_targetaddr;
293 * u8 oda_lun[8];
294 * struct nfs4_string oda_systemid;
295 * struct pnfs_osd_object_cred oda_root_obj_cred;
296 * struct nfs4_string oda_osdname;
297 * };
298 */
299
300/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does
301 * not have an xdr_stream
302 */
303static __be32 *
304__read_opaque_cred(__be32 *p,
305 struct pnfs_osd_opaque_cred *opaque_cred)
306{
307 opaque_cred->cred_len = be32_to_cpu(*p++);
308 opaque_cred->cred = p;
309 return p + XDR_QUADLEN(opaque_cred->cred_len);
310}
311
312static __be32 *
313__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp)
314{
315 p = _osd_xdr_decode_objid(p, &comp->oc_object_id);
316 comp->oc_osd_version = be32_to_cpup(p++);
317 comp->oc_cap_key_sec = be32_to_cpup(p++);
318
319 p = __read_opaque_cred(p, &comp->oc_cap_key);
320 p = __read_opaque_cred(p, &comp->oc_cap);
321 return p;
322}
323
324void pnfs_osd_xdr_decode_deviceaddr(
325 struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p)
326{
327 p = __read_targetid(p, &deviceaddr->oda_targetid);
328
329 p = __read_targetaddr(p, &deviceaddr->oda_targetaddr);
330
331 p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun,
332 sizeof(deviceaddr->oda_lun));
333
334 p = __read_u8_opaque(p, &deviceaddr->oda_systemid);
335
336 p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred);
337
338 p = __read_u8_opaque(p, &deviceaddr->oda_osdname);
339
340 /* libosd likes this terminated in dbg. It's last, so no problems */
341 deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0;
342}
343
344/*
345 * struct pnfs_osd_layoutupdate {
346 * u32 dsu_valid;
347 * s64 dsu_delta;
348 * u32 olu_ioerr_flag;
349 * }; xdr size 4 + 8 + 4
350 */
351int
352pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr,
353 struct pnfs_osd_layoutupdate *lou)
354{
355 __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4);
356
357 if (!p)
358 return -E2BIG;
359
360 *p++ = cpu_to_be32(lou->dsu_valid);
361 if (lou->dsu_valid)
362 p = xdr_encode_hyper(p, lou->dsu_delta);
363 *p++ = cpu_to_be32(lou->olu_ioerr_flag);
364 return 0;
365}
366
367/*
368 * struct pnfs_osd_objid {
369 * struct nfs4_deviceid oid_device_id;
370 * u64 oid_partition_id;
371 * u64 oid_object_id;
372 * }; // xdr size 32 bytes
373 */
374static inline __be32 *
375pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id)
376{
377 p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data,
378 sizeof(object_id->oid_device_id.data));
379 p = xdr_encode_hyper(p, object_id->oid_partition_id);
380 p = xdr_encode_hyper(p, object_id->oid_object_id);
381
382 return p;
383}
384
385/*
386 * struct pnfs_osd_ioerr {
387 * struct pnfs_osd_objid oer_component;
388 * u64 oer_comp_offset;
389 * u64 oer_comp_length;
390 * u32 oer_iswrite;
391 * u32 oer_errno;
392 * }; // xdr size 32 + 24 bytes
393 */
394void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr)
395{
396 p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component);
397 p = xdr_encode_hyper(p, ioerr->oer_comp_offset);
398 p = xdr_encode_hyper(p, ioerr->oer_comp_length);
399 *p++ = cpu_to_be32(ioerr->oer_iswrite);
400 *p = cpu_to_be32(ioerr->oer_errno);
401}
402
403__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr)
404{
405 __be32 *p;
406
407 p = xdr_reserve_space(xdr, 32 + 24);
408 if (unlikely(!p))
409 dprintk("%s: out of xdr space\n", __func__);
410
411 return p;
412}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index c80add6e2213..7913961aff22 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -204,6 +204,21 @@ nfs_wait_on_request(struct nfs_page *req)
204 TASK_UNINTERRUPTIBLE); 204 TASK_UNINTERRUPTIBLE);
205} 205}
206 206
207static bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
208{
209 /*
210 * FIXME: ideally we should be able to coalesce all requests
211 * that are not block boundary aligned, but currently this
212 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
213 * since nfs_flush_multi and nfs_pagein_multi assume you
214 * can have only one struct nfs_page.
215 */
216 if (desc->pg_bsize < PAGE_SIZE)
217 return 0;
218
219 return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
220}
221
207/** 222/**
208 * nfs_pageio_init - initialise a page io descriptor 223 * nfs_pageio_init - initialise a page io descriptor
209 * @desc: pointer to descriptor 224 * @desc: pointer to descriptor
@@ -229,6 +244,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
229 desc->pg_ioflags = io_flags; 244 desc->pg_ioflags = io_flags;
230 desc->pg_error = 0; 245 desc->pg_error = 0;
231 desc->pg_lseg = NULL; 246 desc->pg_lseg = NULL;
247 desc->pg_test = nfs_generic_pg_test;
248 pnfs_pageio_init(desc, inode);
232} 249}
233 250
234/** 251/**
@@ -242,29 +259,23 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
242 * 259 *
243 * Return 'true' if this is the case, else return 'false'. 260 * Return 'true' if this is the case, else return 'false'.
244 */ 261 */
245static int nfs_can_coalesce_requests(struct nfs_page *prev, 262static bool nfs_can_coalesce_requests(struct nfs_page *prev,
246 struct nfs_page *req, 263 struct nfs_page *req,
247 struct nfs_pageio_descriptor *pgio) 264 struct nfs_pageio_descriptor *pgio)
248{ 265{
249 if (req->wb_context->cred != prev->wb_context->cred) 266 if (req->wb_context->cred != prev->wb_context->cred)
250 return 0; 267 return false;
251 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) 268 if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner)
252 return 0; 269 return false;
253 if (req->wb_context->state != prev->wb_context->state) 270 if (req->wb_context->state != prev->wb_context->state)
254 return 0; 271 return false;
255 if (req->wb_index != (prev->wb_index + 1)) 272 if (req->wb_index != (prev->wb_index + 1))
256 return 0; 273 return false;
257 if (req->wb_pgbase != 0) 274 if (req->wb_pgbase != 0)
258 return 0; 275 return false;
259 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 276 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
260 return 0; 277 return false;
261 /* 278 return pgio->pg_test(pgio, prev, req);
262 * Non-whole file layouts need to check that req is inside of
263 * pgio->pg_lseg.
264 */
265 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
266 return 0;
267 return 1;
268} 279}
269 280
270/** 281/**
@@ -278,31 +289,18 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
278static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 289static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
279 struct nfs_page *req) 290 struct nfs_page *req)
280{ 291{
281 size_t newlen = req->wb_bytes;
282
283 if (desc->pg_count != 0) { 292 if (desc->pg_count != 0) {
284 struct nfs_page *prev; 293 struct nfs_page *prev;
285 294
286 /*
287 * FIXME: ideally we should be able to coalesce all requests
288 * that are not block boundary aligned, but currently this
289 * is problematic for the case of bsize < PAGE_CACHE_SIZE,
290 * since nfs_flush_multi and nfs_pagein_multi assume you
291 * can have only one struct nfs_page.
292 */
293 if (desc->pg_bsize < PAGE_SIZE)
294 return 0;
295 newlen += desc->pg_count;
296 if (newlen > desc->pg_bsize)
297 return 0;
298 prev = nfs_list_entry(desc->pg_list.prev); 295 prev = nfs_list_entry(desc->pg_list.prev);
299 if (!nfs_can_coalesce_requests(prev, req, desc)) 296 if (!nfs_can_coalesce_requests(prev, req, desc))
300 return 0; 297 return 0;
301 } else 298 } else {
302 desc->pg_base = req->wb_pgbase; 299 desc->pg_base = req->wb_pgbase;
300 }
303 nfs_list_remove_request(req); 301 nfs_list_remove_request(req);
304 nfs_list_add_request(req, &desc->pg_list); 302 nfs_list_add_request(req, &desc->pg_list);
305 desc->pg_count = newlen; 303 desc->pg_count += req->wb_bytes;
306 return 1; 304 return 1;
307} 305}
308 306
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f57f5281a520..8c1309d852a6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -177,13 +177,28 @@ get_layout_hdr(struct pnfs_layout_hdr *lo)
177 atomic_inc(&lo->plh_refcount); 177 atomic_inc(&lo->plh_refcount);
178} 178}
179 179
180static struct pnfs_layout_hdr *
181pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
182{
183 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
184 return ld->alloc_layout_hdr ? ld->alloc_layout_hdr(ino, gfp_flags) :
185 kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags);
186}
187
188static void
189pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
190{
191 struct pnfs_layoutdriver_type *ld = NFS_SERVER(lo->plh_inode)->pnfs_curr_ld;
192 return ld->alloc_layout_hdr ? ld->free_layout_hdr(lo) : kfree(lo);
193}
194
180static void 195static void
181destroy_layout_hdr(struct pnfs_layout_hdr *lo) 196destroy_layout_hdr(struct pnfs_layout_hdr *lo)
182{ 197{
183 dprintk("%s: freeing layout cache %p\n", __func__, lo); 198 dprintk("%s: freeing layout cache %p\n", __func__, lo);
184 BUG_ON(!list_empty(&lo->plh_layouts)); 199 BUG_ON(!list_empty(&lo->plh_layouts));
185 NFS_I(lo->plh_inode)->layout = NULL; 200 NFS_I(lo->plh_inode)->layout = NULL;
186 kfree(lo); 201 pnfs_free_layout_hdr(lo);
187} 202}
188 203
189static void 204static void
@@ -228,7 +243,7 @@ put_lseg_common(struct pnfs_layout_segment *lseg)
228{ 243{
229 struct inode *inode = lseg->pls_layout->plh_inode; 244 struct inode *inode = lseg->pls_layout->plh_inode;
230 245
231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 246 WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
232 list_del_init(&lseg->pls_list); 247 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) { 248 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags); 249 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
@@ -261,11 +276,72 @@ put_lseg(struct pnfs_layout_segment *lseg)
261} 276}
262EXPORT_SYMBOL_GPL(put_lseg); 277EXPORT_SYMBOL_GPL(put_lseg);
263 278
279static inline u64
280end_offset(u64 start, u64 len)
281{
282 u64 end;
283
284 end = start + len;
285 return end >= start ? end : NFS4_MAX_UINT64;
286}
287
288/* last octet in a range */
289static inline u64
290last_byte_offset(u64 start, u64 len)
291{
292 u64 end;
293
294 BUG_ON(!len);
295 end = start + len;
296 return end > start ? end - 1 : NFS4_MAX_UINT64;
297}
298
299/*
300 * is l2 fully contained in l1?
301 * start1 end1
302 * [----------------------------------)
303 * start2 end2
304 * [----------------)
305 */
306static inline int
307lo_seg_contained(struct pnfs_layout_range *l1,
308 struct pnfs_layout_range *l2)
309{
310 u64 start1 = l1->offset;
311 u64 end1 = end_offset(start1, l1->length);
312 u64 start2 = l2->offset;
313 u64 end2 = end_offset(start2, l2->length);
314
315 return (start1 <= start2) && (end1 >= end2);
316}
317
318/*
319 * is l1 and l2 intersecting?
320 * start1 end1
321 * [----------------------------------)
322 * start2 end2
323 * [----------------)
324 */
325static inline int
326lo_seg_intersecting(struct pnfs_layout_range *l1,
327 struct pnfs_layout_range *l2)
328{
329 u64 start1 = l1->offset;
330 u64 end1 = end_offset(start1, l1->length);
331 u64 start2 = l2->offset;
332 u64 end2 = end_offset(start2, l2->length);
333
334 return (end1 == NFS4_MAX_UINT64 || end1 > start2) &&
335 (end2 == NFS4_MAX_UINT64 || end2 > start1);
336}
337
264static bool 338static bool
265should_free_lseg(u32 lseg_iomode, u32 recall_iomode) 339should_free_lseg(struct pnfs_layout_range *lseg_range,
340 struct pnfs_layout_range *recall_range)
266{ 341{
267 return (recall_iomode == IOMODE_ANY || 342 return (recall_range->iomode == IOMODE_ANY ||
268 lseg_iomode == recall_iomode); 343 lseg_range->iomode == recall_range->iomode) &&
344 lo_seg_intersecting(lseg_range, recall_range);
269} 345}
270 346
271/* Returns 1 if lseg is removed from list, 0 otherwise */ 347/* Returns 1 if lseg is removed from list, 0 otherwise */
@@ -296,7 +372,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
296int 372int
297mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 373mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
298 struct list_head *tmp_list, 374 struct list_head *tmp_list,
299 u32 iomode) 375 struct pnfs_layout_range *recall_range)
300{ 376{
301 struct pnfs_layout_segment *lseg, *next; 377 struct pnfs_layout_segment *lseg, *next;
302 int invalid = 0, removed = 0; 378 int invalid = 0, removed = 0;
@@ -309,7 +385,8 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
309 return 0; 385 return 0;
310 } 386 }
311 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 387 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
312 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 388 if (!recall_range ||
389 should_free_lseg(&lseg->pls_range, recall_range)) {
313 dprintk("%s: freeing lseg %p iomode %d " 390 dprintk("%s: freeing lseg %p iomode %d "
314 "offset %llu length %llu\n", __func__, 391 "offset %llu length %llu\n", __func__,
315 lseg, lseg->pls_range.iomode, lseg->pls_range.offset, 392 lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
@@ -358,7 +435,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
358 lo = nfsi->layout; 435 lo = nfsi->layout;
359 if (lo) { 436 if (lo) {
360 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 437 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
361 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 438 mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
362 } 439 }
363 spin_unlock(&nfsi->vfs_inode.i_lock); 440 spin_unlock(&nfsi->vfs_inode.i_lock);
364 pnfs_free_lseg_list(&tmp_list); 441 pnfs_free_lseg_list(&tmp_list);
@@ -467,7 +544,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
467static struct pnfs_layout_segment * 544static struct pnfs_layout_segment *
468send_layoutget(struct pnfs_layout_hdr *lo, 545send_layoutget(struct pnfs_layout_hdr *lo,
469 struct nfs_open_context *ctx, 546 struct nfs_open_context *ctx,
470 u32 iomode, 547 struct pnfs_layout_range *range,
471 gfp_t gfp_flags) 548 gfp_t gfp_flags)
472{ 549{
473 struct inode *ino = lo->plh_inode; 550 struct inode *ino = lo->plh_inode;
@@ -499,11 +576,11 @@ send_layoutget(struct pnfs_layout_hdr *lo,
499 goto out_err_free; 576 goto out_err_free;
500 } 577 }
501 578
502 lgp->args.minlength = NFS4_MAX_UINT64; 579 lgp->args.minlength = PAGE_CACHE_SIZE;
580 if (lgp->args.minlength > range->length)
581 lgp->args.minlength = range->length;
503 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 582 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
504 lgp->args.range.iomode = iomode; 583 lgp->args.range = *range;
505 lgp->args.range.offset = 0;
506 lgp->args.range.length = NFS4_MAX_UINT64;
507 lgp->args.type = server->pnfs_curr_ld->id; 584 lgp->args.type = server->pnfs_curr_ld->id;
508 lgp->args.inode = ino; 585 lgp->args.inode = ino;
509 lgp->args.ctx = get_nfs_open_context(ctx); 586 lgp->args.ctx = get_nfs_open_context(ctx);
@@ -518,7 +595,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
518 nfs4_proc_layoutget(lgp); 595 nfs4_proc_layoutget(lgp);
519 if (!lseg) { 596 if (!lseg) {
520 /* remember that LAYOUTGET failed and suspend trying */ 597 /* remember that LAYOUTGET failed and suspend trying */
521 set_bit(lo_fail_bit(iomode), &lo->plh_flags); 598 set_bit(lo_fail_bit(range->iomode), &lo->plh_flags);
522 } 599 }
523 600
524 /* free xdr pages */ 601 /* free xdr pages */
@@ -542,6 +619,51 @@ out_err_free:
542 return NULL; 619 return NULL;
543} 620}
544 621
622/* Initiates a LAYOUTRETURN(FILE) */
623int
624_pnfs_return_layout(struct inode *ino)
625{
626 struct pnfs_layout_hdr *lo = NULL;
627 struct nfs_inode *nfsi = NFS_I(ino);
628 LIST_HEAD(tmp_list);
629 struct nfs4_layoutreturn *lrp;
630 nfs4_stateid stateid;
631 int status = 0;
632
633 dprintk("--> %s\n", __func__);
634
635 spin_lock(&ino->i_lock);
636 lo = nfsi->layout;
637 if (!lo || !mark_matching_lsegs_invalid(lo, &tmp_list, NULL)) {
638 spin_unlock(&ino->i_lock);
639 dprintk("%s: no layout segments to return\n", __func__);
640 goto out;
641 }
642 stateid = nfsi->layout->plh_stateid;
643 /* Reference matched in nfs4_layoutreturn_release */
644 get_layout_hdr(lo);
645 spin_unlock(&ino->i_lock);
646 pnfs_free_lseg_list(&tmp_list);
647
648 WARN_ON(test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags));
649
650 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
651 if (unlikely(lrp == NULL)) {
652 status = -ENOMEM;
653 goto out;
654 }
655
656 lrp->args.stateid = stateid;
657 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
658 lrp->args.inode = ino;
659 lrp->clp = NFS_SERVER(ino)->nfs_client;
660
661 status = nfs4_proc_layoutreturn(lrp);
662out:
663 dprintk("<-- %s status: %d\n", __func__, status);
664 return status;
665}
666
545bool pnfs_roc(struct inode *ino) 667bool pnfs_roc(struct inode *ino)
546{ 668{
547 struct pnfs_layout_hdr *lo; 669 struct pnfs_layout_hdr *lo;
@@ -625,10 +747,23 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier)
625 * are seen first. 747 * are seen first.
626 */ 748 */
627static s64 749static s64
628cmp_layout(u32 iomode1, u32 iomode2) 750cmp_layout(struct pnfs_layout_range *l1,
751 struct pnfs_layout_range *l2)
629{ 752{
753 s64 d;
754
755 /* high offset > low offset */
756 d = l1->offset - l2->offset;
757 if (d)
758 return d;
759
760 /* short length > long length */
761 d = l2->length - l1->length;
762 if (d)
763 return d;
764
630 /* read > read/write */ 765 /* read > read/write */
631 return (int)(iomode2 == IOMODE_READ) - (int)(iomode1 == IOMODE_READ); 766 return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
632} 767}
633 768
634static void 769static void
@@ -636,13 +771,12 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
636 struct pnfs_layout_segment *lseg) 771 struct pnfs_layout_segment *lseg)
637{ 772{
638 struct pnfs_layout_segment *lp; 773 struct pnfs_layout_segment *lp;
639 int found = 0;
640 774
641 dprintk("%s:Begin\n", __func__); 775 dprintk("%s:Begin\n", __func__);
642 776
643 assert_spin_locked(&lo->plh_inode->i_lock); 777 assert_spin_locked(&lo->plh_inode->i_lock);
644 list_for_each_entry(lp, &lo->plh_segs, pls_list) { 778 list_for_each_entry(lp, &lo->plh_segs, pls_list) {
645 if (cmp_layout(lp->pls_range.iomode, lseg->pls_range.iomode) > 0) 779 if (cmp_layout(&lseg->pls_range, &lp->pls_range) > 0)
646 continue; 780 continue;
647 list_add_tail(&lseg->pls_list, &lp->pls_list); 781 list_add_tail(&lseg->pls_list, &lp->pls_list);
648 dprintk("%s: inserted lseg %p " 782 dprintk("%s: inserted lseg %p "
@@ -652,16 +786,14 @@ pnfs_insert_layout(struct pnfs_layout_hdr *lo,
652 lseg->pls_range.offset, lseg->pls_range.length, 786 lseg->pls_range.offset, lseg->pls_range.length,
653 lp, lp->pls_range.iomode, lp->pls_range.offset, 787 lp, lp->pls_range.iomode, lp->pls_range.offset,
654 lp->pls_range.length); 788 lp->pls_range.length);
655 found = 1; 789 goto out;
656 break;
657 }
658 if (!found) {
659 list_add_tail(&lseg->pls_list, &lo->plh_segs);
660 dprintk("%s: inserted lseg %p "
661 "iomode %d offset %llu length %llu at tail\n",
662 __func__, lseg, lseg->pls_range.iomode,
663 lseg->pls_range.offset, lseg->pls_range.length);
664 } 790 }
791 list_add_tail(&lseg->pls_list, &lo->plh_segs);
792 dprintk("%s: inserted lseg %p "
793 "iomode %d offset %llu length %llu at tail\n",
794 __func__, lseg, lseg->pls_range.iomode,
795 lseg->pls_range.offset, lseg->pls_range.length);
796out:
665 get_layout_hdr(lo); 797 get_layout_hdr(lo);
666 798
667 dprintk("%s:Return\n", __func__); 799 dprintk("%s:Return\n", __func__);
@@ -672,7 +804,7 @@ alloc_init_layout_hdr(struct inode *ino, gfp_t gfp_flags)
672{ 804{
673 struct pnfs_layout_hdr *lo; 805 struct pnfs_layout_hdr *lo;
674 806
675 lo = kzalloc(sizeof(struct pnfs_layout_hdr), gfp_flags); 807 lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
676 if (!lo) 808 if (!lo)
677 return NULL; 809 return NULL;
678 atomic_set(&lo->plh_refcount, 1); 810 atomic_set(&lo->plh_refcount, 1);
@@ -705,7 +837,7 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
705 if (likely(nfsi->layout == NULL)) /* Won the race? */ 837 if (likely(nfsi->layout == NULL)) /* Won the race? */
706 nfsi->layout = new; 838 nfsi->layout = new;
707 else 839 else
708 kfree(new); 840 pnfs_free_layout_hdr(new);
709 return nfsi->layout; 841 return nfsi->layout;
710} 842}
711 843
@@ -721,16 +853,28 @@ pnfs_find_alloc_layout(struct inode *ino, gfp_t gfp_flags)
721 * READ RW true 853 * READ RW true
722 */ 854 */
723static int 855static int
724is_matching_lseg(struct pnfs_layout_segment *lseg, u32 iomode) 856is_matching_lseg(struct pnfs_layout_range *ls_range,
857 struct pnfs_layout_range *range)
725{ 858{
726 return (iomode != IOMODE_RW || lseg->pls_range.iomode == IOMODE_RW); 859 struct pnfs_layout_range range1;
860
861 if ((range->iomode == IOMODE_RW &&
862 ls_range->iomode != IOMODE_RW) ||
863 !lo_seg_intersecting(ls_range, range))
864 return 0;
865
866 /* range1 covers only the first byte in the range */
867 range1 = *range;
868 range1.length = 1;
869 return lo_seg_contained(ls_range, &range1);
727} 870}
728 871
729/* 872/*
730 * lookup range in layout 873 * lookup range in layout
731 */ 874 */
732static struct pnfs_layout_segment * 875static struct pnfs_layout_segment *
733pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode) 876pnfs_find_lseg(struct pnfs_layout_hdr *lo,
877 struct pnfs_layout_range *range)
734{ 878{
735 struct pnfs_layout_segment *lseg, *ret = NULL; 879 struct pnfs_layout_segment *lseg, *ret = NULL;
736 880
@@ -739,11 +883,11 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
739 assert_spin_locked(&lo->plh_inode->i_lock); 883 assert_spin_locked(&lo->plh_inode->i_lock);
740 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 884 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
741 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 885 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
742 is_matching_lseg(lseg, iomode)) { 886 is_matching_lseg(&lseg->pls_range, range)) {
743 ret = get_lseg(lseg); 887 ret = get_lseg(lseg);
744 break; 888 break;
745 } 889 }
746 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 890 if (cmp_layout(range, &lseg->pls_range) > 0)
747 break; 891 break;
748 } 892 }
749 893
@@ -759,9 +903,17 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
759struct pnfs_layout_segment * 903struct pnfs_layout_segment *
760pnfs_update_layout(struct inode *ino, 904pnfs_update_layout(struct inode *ino,
761 struct nfs_open_context *ctx, 905 struct nfs_open_context *ctx,
906 loff_t pos,
907 u64 count,
762 enum pnfs_iomode iomode, 908 enum pnfs_iomode iomode,
763 gfp_t gfp_flags) 909 gfp_t gfp_flags)
764{ 910{
911 struct pnfs_layout_range arg = {
912 .iomode = iomode,
913 .offset = pos,
914 .length = count,
915 };
916 unsigned pg_offset;
765 struct nfs_inode *nfsi = NFS_I(ino); 917 struct nfs_inode *nfsi = NFS_I(ino);
766 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 918 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
767 struct pnfs_layout_hdr *lo; 919 struct pnfs_layout_hdr *lo;
@@ -789,7 +941,7 @@ pnfs_update_layout(struct inode *ino,
789 goto out_unlock; 941 goto out_unlock;
790 942
791 /* Check to see if the layout for the given range already exists */ 943 /* Check to see if the layout for the given range already exists */
792 lseg = pnfs_find_lseg(lo, iomode); 944 lseg = pnfs_find_lseg(lo, &arg);
793 if (lseg) 945 if (lseg)
794 goto out_unlock; 946 goto out_unlock;
795 947
@@ -811,7 +963,14 @@ pnfs_update_layout(struct inode *ino,
811 spin_unlock(&clp->cl_lock); 963 spin_unlock(&clp->cl_lock);
812 } 964 }
813 965
814 lseg = send_layoutget(lo, ctx, iomode, gfp_flags); 966 pg_offset = arg.offset & ~PAGE_CACHE_MASK;
967 if (pg_offset) {
968 arg.offset -= pg_offset;
969 arg.length += pg_offset;
970 }
971 arg.length = PAGE_CACHE_ALIGN(arg.length);
972
973 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
815 if (!lseg && first) { 974 if (!lseg && first) {
816 spin_lock(&clp->cl_lock); 975 spin_lock(&clp->cl_lock);
817 list_del_init(&lo->plh_layouts); 976 list_del_init(&lo->plh_layouts);
@@ -838,17 +997,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
838 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 997 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
839 int status = 0; 998 int status = 0;
840 999
841 /* Verify we got what we asked for.
842 * Note that because the xdr parsing only accepts a single
843 * element array, this can fail even if the server is behaving
844 * correctly.
845 */
846 if (lgp->args.range.iomode > res->range.iomode ||
847 res->range.offset != 0 ||
848 res->range.length != NFS4_MAX_UINT64) {
849 status = -EINVAL;
850 goto out;
851 }
852 /* Inject layout blob into I/O device driver */ 1000 /* Inject layout blob into I/O device driver */
853 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags); 1001 lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
854 if (!lseg || IS_ERR(lseg)) { 1002 if (!lseg || IS_ERR(lseg)) {
@@ -895,51 +1043,64 @@ out_forget_reply:
895 goto out; 1043 goto out;
896} 1044}
897 1045
898static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio, 1046bool
899 struct nfs_page *prev, 1047pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
900 struct nfs_page *req) 1048 struct nfs_page *req)
901{ 1049{
1050 enum pnfs_iomode access_type;
1051 gfp_t gfp_flags;
1052
1053 /* We assume that pg_ioflags == 0 iff we're reading a page */
1054 if (pgio->pg_ioflags == 0) {
1055 access_type = IOMODE_READ;
1056 gfp_flags = GFP_KERNEL;
1057 } else {
1058 access_type = IOMODE_RW;
1059 gfp_flags = GFP_NOFS;
1060 }
1061
902 if (pgio->pg_count == prev->wb_bytes) { 1062 if (pgio->pg_count == prev->wb_bytes) {
903 /* This is first coelesce call for a series of nfs_pages */ 1063 /* This is first coelesce call for a series of nfs_pages */
904 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1064 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
905 prev->wb_context, 1065 prev->wb_context,
906 IOMODE_READ, 1066 req_offset(req),
907 GFP_KERNEL); 1067 pgio->pg_count,
1068 access_type,
1069 gfp_flags);
1070 return true;
908 } 1071 }
909 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
910}
911 1072
912void 1073 if (pgio->pg_lseg &&
913pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode) 1074 req_offset(req) > end_offset(pgio->pg_lseg->pls_range.offset,
914{ 1075 pgio->pg_lseg->pls_range.length))
915 struct pnfs_layoutdriver_type *ld; 1076 return false;
916 1077
917 ld = NFS_SERVER(inode)->pnfs_curr_ld; 1078 return true;
918 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
919} 1079}
1080EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
920 1081
921static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio, 1082/*
922 struct nfs_page *prev, 1083 * Called by non rpc-based layout drivers
923 struct nfs_page *req) 1084 */
1085int
1086pnfs_ld_write_done(struct nfs_write_data *data)
924{ 1087{
925 if (pgio->pg_count == prev->wb_bytes) { 1088 int status;
926 /* This is first coelesce call for a series of nfs_pages */
927 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
928 prev->wb_context,
929 IOMODE_RW,
930 GFP_NOFS);
931 }
932 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
933}
934 1089
935void 1090 if (!data->pnfs_error) {
936pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode) 1091 pnfs_set_layoutcommit(data);
937{ 1092 data->mds_ops->rpc_call_done(&data->task, data);
938 struct pnfs_layoutdriver_type *ld; 1093 data->mds_ops->rpc_release(data);
1094 return 0;
1095 }
939 1096
940 ld = NFS_SERVER(inode)->pnfs_curr_ld; 1097 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
941 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL; 1098 data->pnfs_error);
1099 status = nfs_initiate_write(data, NFS_CLIENT(data->inode),
1100 data->mds_ops, NFS_FILE_SYNC);
1101 return status ? : -EAGAIN;
942} 1102}
1103EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
943 1104
944enum pnfs_try_status 1105enum pnfs_try_status
945pnfs_try_to_write_data(struct nfs_write_data *wdata, 1106pnfs_try_to_write_data(struct nfs_write_data *wdata,
@@ -966,6 +1127,29 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
966} 1127}
967 1128
968/* 1129/*
1130 * Called by non rpc-based layout drivers
1131 */
1132int
1133pnfs_ld_read_done(struct nfs_read_data *data)
1134{
1135 int status;
1136
1137 if (!data->pnfs_error) {
1138 __nfs4_read_done_cb(data);
1139 data->mds_ops->rpc_call_done(&data->task, data);
1140 data->mds_ops->rpc_release(data);
1141 return 0;
1142 }
1143
1144 dprintk("%s: pnfs_error=%d, retry via MDS\n", __func__,
1145 data->pnfs_error);
1146 status = nfs_initiate_read(data, NFS_CLIENT(data->inode),
1147 data->mds_ops);
1148 return status ? : -EAGAIN;
1149}
1150EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
1151
1152/*
969 * Call the appropriate parallel I/O subsystem read function. 1153 * Call the appropriate parallel I/O subsystem read function.
970 */ 1154 */
971enum pnfs_try_status 1155enum pnfs_try_status
@@ -1009,7 +1193,7 @@ void
1009pnfs_set_layoutcommit(struct nfs_write_data *wdata) 1193pnfs_set_layoutcommit(struct nfs_write_data *wdata)
1010{ 1194{
1011 struct nfs_inode *nfsi = NFS_I(wdata->inode); 1195 struct nfs_inode *nfsi = NFS_I(wdata->inode);
1012 loff_t end_pos = wdata->args.offset + wdata->res.count; 1196 loff_t end_pos = wdata->mds_offset + wdata->res.count;
1013 bool mark_as_dirty = false; 1197 bool mark_as_dirty = false;
1014 1198
1015 spin_lock(&nfsi->vfs_inode.i_lock); 1199 spin_lock(&nfsi->vfs_inode.i_lock);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0c015bad9e7a..48d0a8e4d062 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,7 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_fs.h>
33#include <linux/nfs_page.h> 34#include <linux/nfs_page.h>
34 35
35enum { 36enum {
@@ -64,17 +65,29 @@ enum {
64 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */ 65 NFS_LAYOUT_DESTROYED, /* no new use of layout allowed */
65}; 66};
66 67
68enum layoutdriver_policy_flags {
69 /* Should the pNFS client commit and return the layout upon a setattr */
70 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
71};
72
73struct nfs4_deviceid_node;
74
67/* Per-layout driver specific registration structure */ 75/* Per-layout driver specific registration structure */
68struct pnfs_layoutdriver_type { 76struct pnfs_layoutdriver_type {
69 struct list_head pnfs_tblid; 77 struct list_head pnfs_tblid;
70 const u32 id; 78 const u32 id;
71 const char *name; 79 const char *name;
72 struct module *owner; 80 struct module *owner;
81 unsigned flags;
82
83 struct pnfs_layout_hdr * (*alloc_layout_hdr) (struct inode *inode, gfp_t gfp_flags);
84 void (*free_layout_hdr) (struct pnfs_layout_hdr *);
85
73 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 86 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
74 void (*free_lseg) (struct pnfs_layout_segment *lseg); 87 void (*free_lseg) (struct pnfs_layout_segment *lseg);
75 88
76 /* test for nfs page cache coalescing */ 89 /* test for nfs page cache coalescing */
77 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *); 90 bool (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
78 91
79 /* Returns true if layoutdriver wants to divert this request to 92 /* Returns true if layoutdriver wants to divert this request to
80 * driver's commit routine. 93 * driver's commit routine.
@@ -89,6 +102,16 @@ struct pnfs_layoutdriver_type {
89 */ 102 */
90 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data); 103 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
91 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how); 104 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
105
106 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
107
108 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
109 struct xdr_stream *xdr,
110 const struct nfs4_layoutreturn_args *args);
111
112 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
113 struct xdr_stream *xdr,
114 const struct nfs4_layoutcommit_args *args);
92}; 115};
93 116
94struct pnfs_layout_hdr { 117struct pnfs_layout_hdr {
@@ -120,21 +143,22 @@ extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
120extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 143extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
121 struct pnfs_device *dev); 144 struct pnfs_device *dev);
122extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp); 145extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
123 147
124/* pnfs.c */ 148/* pnfs.c */
125void get_layout_hdr(struct pnfs_layout_hdr *lo); 149void get_layout_hdr(struct pnfs_layout_hdr *lo);
126void put_lseg(struct pnfs_layout_segment *lseg); 150void put_lseg(struct pnfs_layout_segment *lseg);
127struct pnfs_layout_segment * 151struct pnfs_layout_segment *
128pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 152pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
129 enum pnfs_iomode access_type, gfp_t gfp_flags); 153 loff_t pos, u64 count, enum pnfs_iomode access_type,
154 gfp_t gfp_flags);
130void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 155void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
131void unset_pnfs_layoutdriver(struct nfs_server *); 156void unset_pnfs_layoutdriver(struct nfs_server *);
132enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *, 157enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
133 const struct rpc_call_ops *, int); 158 const struct rpc_call_ops *, int);
134enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *, 159enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
135 const struct rpc_call_ops *); 160 const struct rpc_call_ops *);
136void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *); 161bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
137void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
138int pnfs_layout_process(struct nfs4_layoutget *lgp); 162int pnfs_layout_process(struct nfs4_layoutget *lgp);
139void pnfs_free_lseg_list(struct list_head *tmp_list); 163void pnfs_free_lseg_list(struct list_head *tmp_list);
140void pnfs_destroy_layout(struct nfs_inode *); 164void pnfs_destroy_layout(struct nfs_inode *);
@@ -148,13 +172,37 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
148 struct nfs4_state *open_state); 172 struct nfs4_state *open_state);
149int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 173int mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
150 struct list_head *tmp_list, 174 struct list_head *tmp_list,
151 u32 iomode); 175 struct pnfs_layout_range *recall_range);
152bool pnfs_roc(struct inode *ino); 176bool pnfs_roc(struct inode *ino);
153void pnfs_roc_release(struct inode *ino); 177void pnfs_roc_release(struct inode *ino);
154void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 178void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
155bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 179bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
156void pnfs_set_layoutcommit(struct nfs_write_data *wdata); 180void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
157int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 181int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
182int _pnfs_return_layout(struct inode *);
183int pnfs_ld_write_done(struct nfs_write_data *);
184int pnfs_ld_read_done(struct nfs_read_data *);
185
186/* pnfs_dev.c */
187struct nfs4_deviceid_node {
188 struct hlist_node node;
189 const struct pnfs_layoutdriver_type *ld;
190 const struct nfs_client *nfs_client;
191 struct nfs4_deviceid deviceid;
192 atomic_t ref;
193};
194
195void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id);
196struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
197struct nfs4_deviceid_node *nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
198void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
199void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
200 const struct pnfs_layoutdriver_type *,
201 const struct nfs_client *,
202 const struct nfs4_deviceid *);
203struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
204bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
205void nfs4_deviceid_purge_client(const struct nfs_client *);
158 206
159static inline int lo_fail_bit(u32 iomode) 207static inline int lo_fail_bit(u32 iomode)
160{ 208{
@@ -223,6 +271,36 @@ static inline void pnfs_clear_request_commit(struct nfs_page *req)
223 put_lseg(req->wb_commit_lseg); 271 put_lseg(req->wb_commit_lseg);
224} 272}
225 273
274/* Should the pNFS client commit and return the layout upon a setattr */
275static inline bool
276pnfs_ld_layoutret_on_setattr(struct inode *inode)
277{
278 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
279 return false;
280 return NFS_SERVER(inode)->pnfs_curr_ld->flags &
281 PNFS_LAYOUTRET_ON_SETATTR;
282}
283
284static inline int pnfs_return_layout(struct inode *ino)
285{
286 struct nfs_inode *nfsi = NFS_I(ino);
287 struct nfs_server *nfss = NFS_SERVER(ino);
288
289 if (pnfs_enabled_sb(nfss) && nfsi->layout)
290 return _pnfs_return_layout(ino);
291
292 return 0;
293}
294
295static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
296 struct inode *inode)
297{
298 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
299
300 if (ld)
301 pgio->pg_test = ld->pg_test;
302}
303
226#else /* CONFIG_NFS_V4_1 */ 304#else /* CONFIG_NFS_V4_1 */
227 305
228static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 306static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -245,7 +323,8 @@ static inline void put_lseg(struct pnfs_layout_segment *lseg)
245 323
246static inline struct pnfs_layout_segment * 324static inline struct pnfs_layout_segment *
247pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 325pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
248 enum pnfs_iomode access_type, gfp_t gfp_flags) 326 loff_t pos, u64 count, enum pnfs_iomode access_type,
327 gfp_t gfp_flags)
249{ 328{
250 return NULL; 329 return NULL;
251} 330}
@@ -264,6 +343,17 @@ pnfs_try_to_write_data(struct nfs_write_data *data,
264 return PNFS_NOT_ATTEMPTED; 343 return PNFS_NOT_ATTEMPTED;
265} 344}
266 345
346static inline int pnfs_return_layout(struct inode *ino)
347{
348 return 0;
349}
350
351static inline bool
352pnfs_ld_layoutret_on_setattr(struct inode *inode)
353{
354 return false;
355}
356
267static inline bool 357static inline bool
268pnfs_roc(struct inode *ino) 358pnfs_roc(struct inode *ino)
269{ 359{
@@ -294,16 +384,9 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
294{ 384{
295} 385}
296 386
297static inline void 387static inline void pnfs_pageio_init(struct nfs_pageio_descriptor *pgio,
298pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino) 388 struct inode *inode)
299{
300 pgio->pg_test = NULL;
301}
302
303static inline void
304pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
305{ 389{
306 pgio->pg_test = NULL;
307} 390}
308 391
309static inline void 392static inline void
@@ -331,6 +414,10 @@ static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
331{ 414{
332 return 0; 415 return 0;
333} 416}
417
418static inline void nfs4_deviceid_purge_client(struct nfs_client *ncl)
419{
420}
334#endif /* CONFIG_NFS_V4_1 */ 421#endif /* CONFIG_NFS_V4_1 */
335 422
336#endif /* FS_NFS_PNFS_H */ 423#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
new file mode 100644
index 000000000000..c65e133ce9c0
--- /dev/null
+++ b/fs/nfs/pnfs_dev.c
@@ -0,0 +1,270 @@
1/*
2 * Device operations for the pnfs client.
3 *
4 * Copyright (c) 2002
5 * The Regents of the University of Michigan
6 * All Rights Reserved
7 *
8 * Dean Hildebrand <dhildebz@umich.edu>
9 * Garth Goodson <Garth.Goodson@netapp.com>
10 *
11 * Permission is granted to use, copy, create derivative works, and
12 * redistribute this software and such derivative works for any purpose,
13 * so long as the name of the University of Michigan is not used in
14 * any advertising or publicity pertaining to the use or distribution
15 * of this software without specific, written prior authorization. If
16 * the above copyright notice or any other identification of the
17 * University of Michigan is included in any copy of any portion of
18 * this software, then the disclaimer below must also be included.
19 *
20 * This software is provided as is, without representation or warranty
21 * of any kind either express or implied, including without limitation
22 * the implied warranties of merchantability, fitness for a particular
23 * purpose, or noninfringement. The Regents of the University of
24 * Michigan shall not be liable for any damages, including special,
25 * indirect, incidental, or consequential damages, with respect to any
26 * claim arising out of or in connection with the use of the software,
27 * even if it has been or is hereafter advised of the possibility of
28 * such damages.
29 */
30
31#include "pnfs.h"
32
33#define NFSDBG_FACILITY NFSDBG_PNFS
34
35/*
36 * Device ID RCU cache. A device ID is unique per server and layout type.
37 */
38#define NFS4_DEVICE_ID_HASH_BITS 5
39#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
40#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
41
42static struct hlist_head nfs4_deviceid_cache[NFS4_DEVICE_ID_HASH_SIZE];
43static DEFINE_SPINLOCK(nfs4_deviceid_lock);
44
45void
46nfs4_print_deviceid(const struct nfs4_deviceid *id)
47{
48 u32 *p = (u32 *)id;
49
50 dprintk("%s: device id= [%x%x%x%x]\n", __func__,
51 p[0], p[1], p[2], p[3]);
52}
53EXPORT_SYMBOL_GPL(nfs4_print_deviceid);
54
55static inline u32
56nfs4_deviceid_hash(const struct nfs4_deviceid *id)
57{
58 unsigned char *cptr = (unsigned char *)id->data;
59 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
60 u32 x = 0;
61
62 while (nbytes--) {
63 x *= 37;
64 x += *cptr++;
65 }
66 return x & NFS4_DEVICE_ID_HASH_MASK;
67}
68
69static struct nfs4_deviceid_node *
70_lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
71 const struct nfs_client *clp, const struct nfs4_deviceid *id,
72 long hash)
73{
74 struct nfs4_deviceid_node *d;
75 struct hlist_node *n;
76
77 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
78 if (d->ld == ld && d->nfs_client == clp &&
79 !memcmp(&d->deviceid, id, sizeof(*id))) {
80 if (atomic_read(&d->ref))
81 return d;
82 else
83 continue;
84 }
85 return NULL;
86}
87
88/*
89 * Lookup a deviceid in cache and get a reference count on it if found
90 *
91 * @clp nfs_client associated with deviceid
92 * @id deviceid to look up
93 */
94struct nfs4_deviceid_node *
95_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
96 const struct nfs_client *clp, const struct nfs4_deviceid *id,
97 long hash)
98{
99 struct nfs4_deviceid_node *d;
100
101 rcu_read_lock();
102 d = _lookup_deviceid(ld, clp, id, hash);
103 if (d && !atomic_inc_not_zero(&d->ref))
104 d = NULL;
105 rcu_read_unlock();
106 return d;
107}
108
109struct nfs4_deviceid_node *
110nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
111 const struct nfs_client *clp, const struct nfs4_deviceid *id)
112{
113 return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
114}
115EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
116
117/*
118 * Unhash and put deviceid
119 *
120 * @clp nfs_client associated with deviceid
121 * @id the deviceid to unhash
122 *
123 * @ret the unhashed node, if found and dereferenced to zero, NULL otherwise.
124 */
125struct nfs4_deviceid_node *
126nfs4_unhash_put_deviceid(const struct pnfs_layoutdriver_type *ld,
127 const struct nfs_client *clp, const struct nfs4_deviceid *id)
128{
129 struct nfs4_deviceid_node *d;
130
131 spin_lock(&nfs4_deviceid_lock);
132 rcu_read_lock();
133 d = _lookup_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
134 rcu_read_unlock();
135 if (!d) {
136 spin_unlock(&nfs4_deviceid_lock);
137 return NULL;
138 }
139 hlist_del_init_rcu(&d->node);
140 spin_unlock(&nfs4_deviceid_lock);
141 synchronize_rcu();
142
143 /* balance the initial ref set in pnfs_insert_deviceid */
144 if (atomic_dec_and_test(&d->ref))
145 return d;
146
147 return NULL;
148}
149EXPORT_SYMBOL_GPL(nfs4_unhash_put_deviceid);
150
151/*
152 * Delete a deviceid from cache
153 *
154 * @clp struct nfs_client qualifying the deviceid
155 * @id deviceid to delete
156 */
157void
158nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
159 const struct nfs_client *clp, const struct nfs4_deviceid *id)
160{
161 struct nfs4_deviceid_node *d;
162
163 d = nfs4_unhash_put_deviceid(ld, clp, id);
164 if (!d)
165 return;
166 d->ld->free_deviceid_node(d);
167}
168EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
169
170void
171nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
172 const struct pnfs_layoutdriver_type *ld,
173 const struct nfs_client *nfs_client,
174 const struct nfs4_deviceid *id)
175{
176 INIT_HLIST_NODE(&d->node);
177 d->ld = ld;
178 d->nfs_client = nfs_client;
179 d->deviceid = *id;
180 atomic_set(&d->ref, 1);
181}
182EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
183
184/*
185 * Uniquely initialize and insert a deviceid node into cache
186 *
187 * @new new deviceid node
188 * Note that the caller must set up the following members:
189 * new->ld
190 * new->nfs_client
191 * new->deviceid
192 *
193 * @ret the inserted node, if none found, otherwise, the found entry.
194 */
195struct nfs4_deviceid_node *
196nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
197{
198 struct nfs4_deviceid_node *d;
199 long hash;
200
201 spin_lock(&nfs4_deviceid_lock);
202 hash = nfs4_deviceid_hash(&new->deviceid);
203 d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
204 if (d) {
205 spin_unlock(&nfs4_deviceid_lock);
206 return d;
207 }
208
209 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
210 spin_unlock(&nfs4_deviceid_lock);
211
212 return new;
213}
214EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
215
216/*
217 * Dereference a deviceid node and delete it when its reference count drops
218 * to zero.
219 *
220 * @d deviceid node to put
221 *
222 * @ret true iff the node was deleted
223 */
224bool
225nfs4_put_deviceid_node(struct nfs4_deviceid_node *d)
226{
227 if (!atomic_dec_and_lock(&d->ref, &nfs4_deviceid_lock))
228 return false;
229 hlist_del_init_rcu(&d->node);
230 spin_unlock(&nfs4_deviceid_lock);
231 synchronize_rcu();
232 d->ld->free_deviceid_node(d);
233 return true;
234}
235EXPORT_SYMBOL_GPL(nfs4_put_deviceid_node);
236
237static void
238_deviceid_purge_client(const struct nfs_client *clp, long hash)
239{
240 struct nfs4_deviceid_node *d;
241 struct hlist_node *n, *next;
242 HLIST_HEAD(tmp);
243
244 rcu_read_lock();
245 hlist_for_each_entry_rcu(d, n, &nfs4_deviceid_cache[hash], node)
246 if (d->nfs_client == clp && atomic_read(&d->ref)) {
247 hlist_del_init_rcu(&d->node);
248 hlist_add_head(&d->node, &tmp);
249 }
250 rcu_read_unlock();
251
252 if (hlist_empty(&tmp))
253 return;
254
255 synchronize_rcu();
256 hlist_for_each_entry_safe(d, n, next, &tmp, node)
257 if (atomic_dec_and_test(&d->ref))
258 d->ld->free_deviceid_node(d);
259}
260
261void
262nfs4_deviceid_purge_client(const struct nfs_client *clp)
263{
264 long h;
265
266 spin_lock(&nfs4_deviceid_lock);
267 for (h = 0; h < NFS4_DEVICE_ID_HASH_SIZE; h++)
268 _deviceid_purge_client(clp, h);
269 spin_unlock(&nfs4_deviceid_lock);
270}
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 2bcf0dc306a1..20a7f952e244 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -288,7 +288,9 @@ static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
288 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
289 289
290 BUG_ON(desc->pg_lseg != NULL); 290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); 291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
292 req_offset(req), desc->pg_count,
293 IOMODE_READ, GFP_KERNEL);
292 ClearPageError(page); 294 ClearPageError(page);
293 offset = 0; 295 offset = 0;
294 nbytes = desc->pg_count; 296 nbytes = desc->pg_count;
@@ -351,7 +353,9 @@ static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
351 } 353 }
352 req = nfs_list_entry(data->pages.next); 354 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages)) 355 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ, GFP_KERNEL); 356 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
357 req_offset(req), desc->pg_count,
358 IOMODE_READ, GFP_KERNEL);
355 359
356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count, 360 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
357 0, lseg); 361 0, lseg);
@@ -660,7 +664,6 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
660 if (ret == 0) 664 if (ret == 0)
661 goto read_complete; /* all pages were read */ 665 goto read_complete; /* all pages were read */
662 666
663 pnfs_pageio_init_read(&pgio, inode);
664 if (rsize < PAGE_CACHE_SIZE) 667 if (rsize < PAGE_CACHE_SIZE)
665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 668 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
666 else 669 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e288f06d3fa7..ce40e5c568ba 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -63,6 +63,7 @@
63#include "iostat.h" 63#include "iostat.h"
64#include "internal.h" 64#include "internal.h"
65#include "fscache.h" 65#include "fscache.h"
66#include "pnfs.h"
66 67
67#define NFSDBG_FACILITY NFSDBG_VFS 68#define NFSDBG_FACILITY NFSDBG_VFS
68 69
@@ -732,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
732 733
733 return 0; 734 return 0;
734} 735}
736#ifdef CONFIG_NFS_V4_1
737void show_sessions(struct seq_file *m, struct nfs_server *server)
738{
739 if (nfs4_has_session(server->nfs_client))
740 seq_printf(m, ",sessions");
741}
742#else
743void show_sessions(struct seq_file *m, struct nfs_server *server) {}
744#endif
745
746#ifdef CONFIG_NFS_V4_1
747void show_pnfs(struct seq_file *m, struct nfs_server *server)
748{
749 seq_printf(m, ",pnfs=");
750 if (server->pnfs_curr_ld)
751 seq_printf(m, "%s", server->pnfs_curr_ld->name);
752 else
753 seq_printf(m, "not configured");
754}
755#else /* CONFIG_NFS_V4_1 */
756void show_pnfs(struct seq_file *m, struct nfs_server *server) {}
757#endif /* CONFIG_NFS_V4_1 */
735 758
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt) 759static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{ 760{
@@ -792,6 +815,8 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
792 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]); 815 seq_printf(m, "bm0=0x%x", nfss->attr_bitmask[0]);
793 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]); 816 seq_printf(m, ",bm1=0x%x", nfss->attr_bitmask[1]);
794 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask); 817 seq_printf(m, ",acl=0x%x", nfss->acl_bitmask);
818 show_sessions(m, nfss);
819 show_pnfs(m, nfss);
795 } 820 }
796#endif 821#endif
797 822
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 49c715b4ac92..e268e3b23497 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -939,7 +939,9 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
939 atomic_set(&req->wb_complete, requests); 939 atomic_set(&req->wb_complete, requests);
940 940
941 BUG_ON(desc->pg_lseg); 941 BUG_ON(desc->pg_lseg);
942 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); 942 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
943 req_offset(req), desc->pg_count,
944 IOMODE_RW, GFP_NOFS);
943 ClearPageError(page); 945 ClearPageError(page);
944 offset = 0; 946 offset = 0;
945 nbytes = desc->pg_count; 947 nbytes = desc->pg_count;
@@ -1013,7 +1015,9 @@ static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
1013 } 1015 }
1014 req = nfs_list_entry(data->pages.next); 1016 req = nfs_list_entry(data->pages.next);
1015 if ((!lseg) && list_is_singular(&data->pages)) 1017 if ((!lseg) && list_is_singular(&data->pages))
1016 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW, GFP_NOFS); 1018 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context,
1019 req_offset(req), desc->pg_count,
1020 IOMODE_RW, GFP_NOFS);
1017 1021
1018 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 1022 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1019 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit)) 1023 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
@@ -1032,8 +1036,6 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
1032{ 1036{
1033 size_t wsize = NFS_SERVER(inode)->wsize; 1037 size_t wsize = NFS_SERVER(inode)->wsize;
1034 1038
1035 pnfs_pageio_init_write(pgio, inode);
1036
1037 if (wsize < PAGE_CACHE_SIZE) 1039 if (wsize < PAGE_CACHE_SIZE)
1038 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1040 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
1039 else 1041 else
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index ad000aeb21a2..b9566e46219f 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1354,12 +1354,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
1354 if (IS_ERR(exp)) 1354 if (IS_ERR(exp))
1355 return nfserrno(PTR_ERR(exp)); 1355 return nfserrno(PTR_ERR(exp));
1356 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL); 1356 rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
1357 if (rv)
1358 goto out;
1359 rv = check_nfsd_access(exp, rqstp);
1360 if (rv)
1361 fh_put(fhp);
1362out:
1363 exp_put(exp); 1357 exp_put(exp);
1364 return rv; 1358 return rv;
1365} 1359}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 2247fc91d5e9..9095f3c21df9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -245,7 +245,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
245 } 245 }
246 246
247 /* Now create the file and set attributes */ 247 /* Now create the file and set attributes */
248 nfserr = nfsd_create_v3(rqstp, dirfhp, argp->name, argp->len, 248 nfserr = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
249 attr, newfhp, 249 attr, newfhp,
250 argp->createmode, argp->verf, NULL, NULL); 250 argp->createmode, argp->verf, NULL, NULL);
251 251
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index ad48faca20fc..08c6e36ab2eb 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -842,7 +842,7 @@ out:
842 return rv; 842 return rv;
843} 843}
844 844
845__be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) 845static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
846{ 846{
847 struct svc_fh fh; 847 struct svc_fh fh;
848 int err; 848 int err;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5fcb1396a7e3..3a6dbd70b34b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -196,9 +196,9 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
196 196
197 /* 197 /*
198 * Note: create modes (UNCHECKED,GUARDED...) are the same 198 * Note: create modes (UNCHECKED,GUARDED...) are the same
199 * in NFSv4 as in v3. 199 * in NFSv4 as in v3 except EXCLUSIVE4_1.
200 */ 200 */
201 status = nfsd_create_v3(rqstp, current_fh, open->op_fname.data, 201 status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
202 open->op_fname.len, &open->op_iattr, 202 open->op_fname.len, &open->op_iattr,
203 &resfh, open->op_createmode, 203 &resfh, open->op_createmode,
204 (u32 *)open->op_verf.data, 204 (u32 *)open->op_verf.data,
@@ -403,7 +403,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
403 cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen; 403 cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
404 memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval, 404 memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
405 putfh->pf_fhlen); 405 putfh->pf_fhlen);
406 return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); 406 return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
407} 407}
408 408
409static __be32 409static __be32
@@ -762,6 +762,9 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
762 __be32 err; 762 __be32 err;
763 763
764 fh_init(&resfh, NFS4_FHSIZE); 764 fh_init(&resfh, NFS4_FHSIZE);
765 err = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_EXEC);
766 if (err)
767 return err;
765 err = nfsd_lookup_dentry(rqstp, &cstate->current_fh, 768 err = nfsd_lookup_dentry(rqstp, &cstate->current_fh,
766 secinfo->si_name, secinfo->si_namelen, 769 secinfo->si_name, secinfo->si_namelen,
767 &exp, &dentry); 770 &exp, &dentry);
@@ -986,6 +989,9 @@ enum nfsd4_op_flags {
986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ 989 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
987 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ 990 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
988 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ 991 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
992 /* For rfc 5661 section 2.6.3.1.1: */
993 OP_HANDLES_WRONGSEC = 1 << 3,
994 OP_IS_PUTFH_LIKE = 1 << 4,
989}; 995};
990 996
991struct nfsd4_operation { 997struct nfsd4_operation {
@@ -1031,6 +1037,44 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args)
1031 return nfs_ok; 1037 return nfs_ok;
1032} 1038}
1033 1039
1040static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
1041{
1042 return &nfsd4_ops[op->opnum];
1043}
1044
1045static bool need_wrongsec_check(struct svc_rqst *rqstp)
1046{
1047 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1048 struct nfsd4_compoundargs *argp = rqstp->rq_argp;
1049 struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
1050 struct nfsd4_op *next = &argp->ops[resp->opcnt];
1051 struct nfsd4_operation *thisd;
1052 struct nfsd4_operation *nextd;
1053
1054 thisd = OPDESC(this);
1055 /*
1056 * Most ops check wronsec on our own; only the putfh-like ops
1057 * have special rules.
1058 */
1059 if (!(thisd->op_flags & OP_IS_PUTFH_LIKE))
1060 return false;
1061 /*
1062 * rfc 5661 2.6.3.1.1.6: don't bother erroring out a
1063 * put-filehandle operation if we're not going to use the
1064 * result:
1065 */
1066 if (argp->opcnt == resp->opcnt)
1067 return false;
1068
1069 nextd = OPDESC(next);
1070 /*
1071 * Rest of 2.6.3.1.1: certain operations will return WRONGSEC
1072 * errors themselves as necessary; others should check for them
1073 * now:
1074 */
1075 return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
1076}
1077
1034/* 1078/*
1035 * COMPOUND call. 1079 * COMPOUND call.
1036 */ 1080 */
@@ -1108,7 +1152,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1108 goto encode_op; 1152 goto encode_op;
1109 } 1153 }
1110 1154
1111 opdesc = &nfsd4_ops[op->opnum]; 1155 opdesc = OPDESC(op);
1112 1156
1113 if (!cstate->current_fh.fh_dentry) { 1157 if (!cstate->current_fh.fh_dentry) {
1114 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { 1158 if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
@@ -1126,6 +1170,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1126 else 1170 else
1127 BUG_ON(op->status == nfs_ok); 1171 BUG_ON(op->status == nfs_ok);
1128 1172
1173 if (!op->status && need_wrongsec_check(rqstp))
1174 op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
1175
1129encode_op: 1176encode_op:
1130 /* Only from SEQUENCE */ 1177 /* Only from SEQUENCE */
1131 if (resp->cstate.status == nfserr_replay_cache) { 1178 if (resp->cstate.status == nfserr_replay_cache) {
@@ -1217,10 +1264,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1217 }, 1264 },
1218 [OP_LOOKUP] = { 1265 [OP_LOOKUP] = {
1219 .op_func = (nfsd4op_func)nfsd4_lookup, 1266 .op_func = (nfsd4op_func)nfsd4_lookup,
1267 .op_flags = OP_HANDLES_WRONGSEC,
1220 .op_name = "OP_LOOKUP", 1268 .op_name = "OP_LOOKUP",
1221 }, 1269 },
1222 [OP_LOOKUPP] = { 1270 [OP_LOOKUPP] = {
1223 .op_func = (nfsd4op_func)nfsd4_lookupp, 1271 .op_func = (nfsd4op_func)nfsd4_lookupp,
1272 .op_flags = OP_HANDLES_WRONGSEC,
1224 .op_name = "OP_LOOKUPP", 1273 .op_name = "OP_LOOKUPP",
1225 }, 1274 },
1226 [OP_NVERIFY] = { 1275 [OP_NVERIFY] = {
@@ -1229,6 +1278,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
1229 }, 1278 },
1230 [OP_OPEN] = { 1279 [OP_OPEN] = {
1231 .op_func = (nfsd4op_func)nfsd4_open, 1280 .op_func = (nfsd4op_func)nfsd4_open,
1281 .op_flags = OP_HANDLES_WRONGSEC,
1232 .op_name = "OP_OPEN", 1282 .op_name = "OP_OPEN",
1233 }, 1283 },
1234 [OP_OPEN_CONFIRM] = { 1284 [OP_OPEN_CONFIRM] = {
@@ -1241,17 +1291,20 @@ static struct nfsd4_operation nfsd4_ops[] = {
1241 }, 1291 },
1242 [OP_PUTFH] = { 1292 [OP_PUTFH] = {
1243 .op_func = (nfsd4op_func)nfsd4_putfh, 1293 .op_func = (nfsd4op_func)nfsd4_putfh,
1244 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1294 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1295 | OP_IS_PUTFH_LIKE,
1245 .op_name = "OP_PUTFH", 1296 .op_name = "OP_PUTFH",
1246 }, 1297 },
1247 [OP_PUTPUBFH] = { 1298 [OP_PUTPUBFH] = {
1248 .op_func = (nfsd4op_func)nfsd4_putrootfh, 1299 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1249 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1300 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1301 | OP_IS_PUTFH_LIKE,
1250 .op_name = "OP_PUTPUBFH", 1302 .op_name = "OP_PUTPUBFH",
1251 }, 1303 },
1252 [OP_PUTROOTFH] = { 1304 [OP_PUTROOTFH] = {
1253 .op_func = (nfsd4op_func)nfsd4_putrootfh, 1305 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1254 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1306 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1307 | OP_IS_PUTFH_LIKE,
1255 .op_name = "OP_PUTROOTFH", 1308 .op_name = "OP_PUTROOTFH",
1256 }, 1309 },
1257 [OP_READ] = { 1310 [OP_READ] = {
@@ -1281,15 +1334,18 @@ static struct nfsd4_operation nfsd4_ops[] = {
1281 }, 1334 },
1282 [OP_RESTOREFH] = { 1335 [OP_RESTOREFH] = {
1283 .op_func = (nfsd4op_func)nfsd4_restorefh, 1336 .op_func = (nfsd4op_func)nfsd4_restorefh,
1284 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1337 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
1338 | OP_IS_PUTFH_LIKE,
1285 .op_name = "OP_RESTOREFH", 1339 .op_name = "OP_RESTOREFH",
1286 }, 1340 },
1287 [OP_SAVEFH] = { 1341 [OP_SAVEFH] = {
1288 .op_func = (nfsd4op_func)nfsd4_savefh, 1342 .op_func = (nfsd4op_func)nfsd4_savefh,
1343 .op_flags = OP_HANDLES_WRONGSEC,
1289 .op_name = "OP_SAVEFH", 1344 .op_name = "OP_SAVEFH",
1290 }, 1345 },
1291 [OP_SECINFO] = { 1346 [OP_SECINFO] = {
1292 .op_func = (nfsd4op_func)nfsd4_secinfo, 1347 .op_func = (nfsd4op_func)nfsd4_secinfo,
1348 .op_flags = OP_HANDLES_WRONGSEC,
1293 .op_name = "OP_SECINFO", 1349 .op_name = "OP_SECINFO",
1294 }, 1350 },
1295 [OP_SETATTR] = { 1351 [OP_SETATTR] = {
@@ -1353,6 +1409,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
1353 }, 1409 },
1354 [OP_SECINFO_NO_NAME] = { 1410 [OP_SECINFO_NO_NAME] = {
1355 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name, 1411 .op_func = (nfsd4op_func)nfsd4_secinfo_no_name,
1412 .op_flags = OP_HANDLES_WRONGSEC,
1356 .op_name = "OP_SECINFO_NO_NAME", 1413 .op_name = "OP_SECINFO_NO_NAME",
1357 }, 1414 },
1358}; 1415};
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4cf04e11c66c..e98f3c2e9492 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1519,6 +1519,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1519 bool confirm_me = false; 1519 bool confirm_me = false;
1520 int status = 0; 1520 int status = 0;
1521 1521
1522 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1523 return nfserr_inval;
1524
1522 nfs4_lock_state(); 1525 nfs4_lock_state();
1523 unconf = find_unconfirmed_client(&cr_ses->clientid); 1526 unconf = find_unconfirmed_client(&cr_ses->clientid);
1524 conf = find_confirmed_client(&cr_ses->clientid); 1527 conf = find_confirmed_client(&cr_ses->clientid);
@@ -1637,8 +1640,9 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1637 return nfserr_badsession; 1640 return nfserr_badsession;
1638 1641
1639 status = nfsd4_map_bcts_dir(&bcts->dir); 1642 status = nfsd4_map_bcts_dir(&bcts->dir);
1640 nfsd4_new_conn(rqstp, cstate->session, bcts->dir); 1643 if (!status)
1641 return nfs_ok; 1644 nfsd4_new_conn(rqstp, cstate->session, bcts->dir);
1645 return status;
1642} 1646}
1643 1647
1644static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid) 1648static bool nfsd4_compound_in_session(struct nfsd4_session *session, struct nfs4_sessionid *sid)
@@ -1725,6 +1729,13 @@ static void nfsd4_sequence_check_conn(struct nfsd4_conn *new, struct nfsd4_sessi
1725 return; 1729 return;
1726} 1730}
1727 1731
1732static bool nfsd4_session_too_many_ops(struct svc_rqst *rqstp, struct nfsd4_session *session)
1733{
1734 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1735
1736 return args->opcnt > session->se_fchannel.maxops;
1737}
1738
1728__be32 1739__be32
1729nfsd4_sequence(struct svc_rqst *rqstp, 1740nfsd4_sequence(struct svc_rqst *rqstp,
1730 struct nfsd4_compound_state *cstate, 1741 struct nfsd4_compound_state *cstate,
@@ -1753,6 +1764,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
1753 if (!session) 1764 if (!session)
1754 goto out; 1765 goto out;
1755 1766
1767 status = nfserr_too_many_ops;
1768 if (nfsd4_session_too_many_ops(rqstp, session))
1769 goto out;
1770
1756 status = nfserr_badslot; 1771 status = nfserr_badslot;
1757 if (seq->slotid >= session->se_fchannel.maxreqs) 1772 if (seq->slotid >= session->se_fchannel.maxreqs)
1758 goto out; 1773 goto out;
@@ -1808,6 +1823,8 @@ out:
1808__be32 1823__be32
1809nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc) 1824nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
1810{ 1825{
1826 int status = 0;
1827
1811 if (rc->rca_one_fs) { 1828 if (rc->rca_one_fs) {
1812 if (!cstate->current_fh.fh_dentry) 1829 if (!cstate->current_fh.fh_dentry)
1813 return nfserr_nofilehandle; 1830 return nfserr_nofilehandle;
@@ -1817,9 +1834,14 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
1817 */ 1834 */
1818 return nfs_ok; 1835 return nfs_ok;
1819 } 1836 }
1837
1820 nfs4_lock_state(); 1838 nfs4_lock_state();
1821 if (is_client_expired(cstate->session->se_client)) { 1839 status = nfserr_complete_already;
1822 nfs4_unlock_state(); 1840 if (cstate->session->se_client->cl_firststate)
1841 goto out;
1842
1843 status = nfserr_stale_clientid;
1844 if (is_client_expired(cstate->session->se_client))
1823 /* 1845 /*
1824 * The following error isn't really legal. 1846 * The following error isn't really legal.
1825 * But we only get here if the client just explicitly 1847 * But we only get here if the client just explicitly
@@ -1827,11 +1849,13 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
1827 * error it gets back on an operation for the dead 1849 * error it gets back on an operation for the dead
1828 * client. 1850 * client.
1829 */ 1851 */
1830 return nfserr_stale_clientid; 1852 goto out;
1831 } 1853
1854 status = nfs_ok;
1832 nfsd4_create_clid_dir(cstate->session->se_client); 1855 nfsd4_create_clid_dir(cstate->session->se_client);
1856out:
1833 nfs4_unlock_state(); 1857 nfs4_unlock_state();
1834 return nfs_ok; 1858 return status;
1835} 1859}
1836 1860
1837__be32 1861__be32
@@ -2462,7 +2486,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2462 return NULL; 2486 return NULL;
2463} 2487}
2464 2488
2465int share_access_to_flags(u32 share_access) 2489static int share_access_to_flags(u32 share_access)
2466{ 2490{
2467 share_access &= ~NFS4_SHARE_WANT_MASK; 2491 share_access &= ~NFS4_SHARE_WANT_MASK;
2468 2492
@@ -2882,7 +2906,7 @@ out:
2882 return status; 2906 return status;
2883} 2907}
2884 2908
2885struct lock_manager nfsd4_manager = { 2909static struct lock_manager nfsd4_manager = {
2886}; 2910};
2887 2911
2888static void 2912static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6766af00d98..990181103214 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,15 +424,12 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) 424static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
425{ 425{
426 DECODE_HEAD; 426 DECODE_HEAD;
427 u32 dummy;
428 427
429 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8); 428 READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
430 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN); 429 COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
431 READ32(bcts->dir); 430 READ32(bcts->dir);
432 /* XXX: Perhaps Tom Tucker could help us figure out how we 431 /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
433 * should be using ctsa_use_conn_in_rdma_mode: */ 432 * could help us figure out how we should be using it. */
434 READ32(dummy);
435
436 DECODE_TAIL; 433 DECODE_TAIL;
437} 434}
438 435
@@ -588,8 +585,6 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
588 READ_BUF(lockt->lt_owner.len); 585 READ_BUF(lockt->lt_owner.len);
589 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 586 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
590 587
591 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
592 return nfserr_inval;
593 DECODE_TAIL; 588 DECODE_TAIL;
594} 589}
595 590
@@ -3120,7 +3115,7 @@ nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3120 return nfserr; 3115 return nfserr;
3121} 3116}
3122 3117
3123__be32 3118static __be32
3124nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr, 3119nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3125 struct nfsd4_sequence *seq) 3120 struct nfsd4_sequence *seq)
3126{ 3121{
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 55c8e63af0be..90c6aa6d5e0f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -344,7 +344,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
344 * which clients virtually always use auth_sys for, 344 * which clients virtually always use auth_sys for,
345 * even while using RPCSEC_GSS for NFS. 345 * even while using RPCSEC_GSS for NFS.
346 */ 346 */
347 if (access & NFSD_MAY_LOCK) 347 if (access & NFSD_MAY_LOCK || access & NFSD_MAY_BYPASS_GSS)
348 goto skip_pseudoflavor_check; 348 goto skip_pseudoflavor_check;
349 /* 349 /*
350 * Clients may expect to be able to use auth_sys during mount, 350 * Clients may expect to be able to use auth_sys during mount,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 129f3c9f62d5..d5718273bb32 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -181,16 +181,10 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
181 struct svc_export *exp; 181 struct svc_export *exp;
182 struct dentry *dparent; 182 struct dentry *dparent;
183 struct dentry *dentry; 183 struct dentry *dentry;
184 __be32 err;
185 int host_err; 184 int host_err;
186 185
187 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); 186 dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
188 187
189 /* Obtain dentry and export. */
190 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
191 if (err)
192 return err;
193
194 dparent = fhp->fh_dentry; 188 dparent = fhp->fh_dentry;
195 exp = fhp->fh_export; 189 exp = fhp->fh_export;
196 exp_get(exp); 190 exp_get(exp);
@@ -254,6 +248,9 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
254 struct dentry *dentry; 248 struct dentry *dentry;
255 __be32 err; 249 __be32 err;
256 250
251 err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
252 if (err)
253 return err;
257 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry); 254 err = nfsd_lookup_dentry(rqstp, fhp, name, len, &exp, &dentry);
258 if (err) 255 if (err)
259 return err; 256 return err;
@@ -877,13 +874,11 @@ static __be32
877nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 874nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
878 loff_t offset, struct kvec *vec, int vlen, unsigned long *count) 875 loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
879{ 876{
880 struct inode *inode;
881 mm_segment_t oldfs; 877 mm_segment_t oldfs;
882 __be32 err; 878 __be32 err;
883 int host_err; 879 int host_err;
884 880
885 err = nfserr_perm; 881 err = nfserr_perm;
886 inode = file->f_path.dentry->d_inode;
887 882
888 if (file->f_op->splice_read && rqstp->rq_splice_ok) { 883 if (file->f_op->splice_read && rqstp->rq_splice_ok) {
889 struct splice_desc sd = { 884 struct splice_desc sd = {
@@ -1340,11 +1335,18 @@ out_nfserr:
1340} 1335}
1341 1336
1342#ifdef CONFIG_NFSD_V3 1337#ifdef CONFIG_NFSD_V3
1338
1339static inline int nfsd_create_is_exclusive(int createmode)
1340{
1341 return createmode == NFS3_CREATE_EXCLUSIVE
1342 || createmode == NFS4_CREATE_EXCLUSIVE4_1;
1343}
1344
1343/* 1345/*
1344 * NFSv3 version of nfsd_create 1346 * NFSv3 and NFSv4 version of nfsd_create
1345 */ 1347 */
1346__be32 1348__be32
1347nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, 1349do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1348 char *fname, int flen, struct iattr *iap, 1350 char *fname, int flen, struct iattr *iap,
1349 struct svc_fh *resfhp, int createmode, u32 *verifier, 1351 struct svc_fh *resfhp, int createmode, u32 *verifier,
1350 int *truncp, int *created) 1352 int *truncp, int *created)
@@ -1396,7 +1398,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1396 if (err) 1398 if (err)
1397 goto out; 1399 goto out;
1398 1400
1399 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1401 if (nfsd_create_is_exclusive(createmode)) {
1400 /* solaris7 gets confused (bugid 4218508) if these have 1402 /* solaris7 gets confused (bugid 4218508) if these have
1401 * the high bit set, so just clear the high bits. If this is 1403 * the high bit set, so just clear the high bits. If this is
1402 * ever changed to use different attrs for storing the 1404 * ever changed to use different attrs for storing the
@@ -1437,6 +1439,11 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1437 && dchild->d_inode->i_atime.tv_sec == v_atime 1439 && dchild->d_inode->i_atime.tv_sec == v_atime
1438 && dchild->d_inode->i_size == 0 ) 1440 && dchild->d_inode->i_size == 0 )
1439 break; 1441 break;
1442 case NFS4_CREATE_EXCLUSIVE4_1:
1443 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
1444 && dchild->d_inode->i_atime.tv_sec == v_atime
1445 && dchild->d_inode->i_size == 0 )
1446 goto set_attr;
1440 /* fallthru */ 1447 /* fallthru */
1441 case NFS3_CREATE_GUARDED: 1448 case NFS3_CREATE_GUARDED:
1442 err = nfserr_exist; 1449 err = nfserr_exist;
@@ -1455,7 +1462,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1455 1462
1456 nfsd_check_ignore_resizing(iap); 1463 nfsd_check_ignore_resizing(iap);
1457 1464
1458 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1465 if (nfsd_create_is_exclusive(createmode)) {
1459 /* Cram the verifier into atime/mtime */ 1466 /* Cram the verifier into atime/mtime */
1460 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1467 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
1461 | ATTR_MTIME_SET|ATTR_ATIME_SET; 1468 | ATTR_MTIME_SET|ATTR_ATIME_SET;
@@ -2034,7 +2041,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
2034 struct inode *inode = dentry->d_inode; 2041 struct inode *inode = dentry->d_inode;
2035 int err; 2042 int err;
2036 2043
2037 if (acc == NFSD_MAY_NOP) 2044 if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP)
2038 return 0; 2045 return 0;
2039#if 0 2046#if 0
2040 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n", 2047 dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 9a370a5e36b7..e0bbac04d1dd 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -17,10 +17,14 @@
17#define NFSD_MAY_SATTR 8 17#define NFSD_MAY_SATTR 8
18#define NFSD_MAY_TRUNC 16 18#define NFSD_MAY_TRUNC 16
19#define NFSD_MAY_LOCK 32 19#define NFSD_MAY_LOCK 32
20#define NFSD_MAY_MASK 63
21
22/* extra hints to permission and open routines: */
20#define NFSD_MAY_OWNER_OVERRIDE 64 23#define NFSD_MAY_OWNER_OVERRIDE 64
21#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/ 24#define NFSD_MAY_LOCAL_ACCESS 128 /* IRIX doing local access check on device special file*/
22#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256 25#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
23#define NFSD_MAY_NOT_BREAK_LEASE 512 26#define NFSD_MAY_NOT_BREAK_LEASE 512
27#define NFSD_MAY_BYPASS_GSS 1024
24 28
25#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE) 29#define NFSD_MAY_CREATE (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
26#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC) 30#define NFSD_MAY_REMOVE (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
@@ -54,7 +58,7 @@ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
54 int type, dev_t rdev, struct svc_fh *res); 58 int type, dev_t rdev, struct svc_fh *res);
55#ifdef CONFIG_NFSD_V3 59#ifdef CONFIG_NFSD_V3
56__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *); 60__be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
57__be32 nfsd_create_v3(struct svc_rqst *, struct svc_fh *, 61__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
58 char *name, int len, struct iattr *attrs, 62 char *name, int len, struct iattr *attrs,
59 struct svc_fh *res, int createmode, 63 struct svc_fh *res, int createmode,
60 u32 *verifier, int *truncp, int *created); 64 u32 *verifier, int *truncp, int *created);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 587f18432832..b954878ad6ce 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -917,7 +917,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
917 * construction. This function can be called both as a single operation 917 * construction. This function can be called both as a single operation
918 * and as a part of indivisible file operations. 918 * and as a part of indivisible file operations.
919 */ 919 */
920void nilfs_dirty_inode(struct inode *inode) 920void nilfs_dirty_inode(struct inode *inode, int flags)
921{ 921{
922 struct nilfs_transaction_info ti; 922 struct nilfs_transaction_info ti;
923 struct nilfs_mdt_info *mdi = NILFS_MDT(inode); 923 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1102a5fbb744..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -334,8 +334,6 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
334 struct nilfs_transaction_info ti; 334 struct nilfs_transaction_info ti;
335 int err; 335 int err;
336 336
337 dentry_unhash(dentry);
338
339 err = nilfs_transaction_begin(dir->i_sb, &ti, 0); 337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
340 if (err) 338 if (err)
341 return err; 339 return err;
@@ -371,9 +369,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
371 struct nilfs_transaction_info ti; 369 struct nilfs_transaction_info ti;
372 int err; 370 int err;
373 371
374 if (new_inode && S_ISDIR(new_inode->i_mode))
375 dentry_unhash(new_dentry);
376
377 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1); 372 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
378 if (unlikely(err)) 373 if (unlikely(err))
379 return err; 374 return err;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index a9c6a531f80c..f02b9ad43a21 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -269,7 +269,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
269extern int nilfs_inode_dirty(struct inode *); 269extern int nilfs_inode_dirty(struct inode *);
270int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); 270int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
271extern int nilfs_mark_inode_dirty(struct inode *); 271extern int nilfs_mark_inode_dirty(struct inode *);
272extern void nilfs_dirty_inode(struct inode *); 272extern void nilfs_dirty_inode(struct inode *, int flags);
273int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 273int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
274 __u64 start, __u64 len); 274 __u64 start, __u64 len);
275 275
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index c368360c35a1..3b8d3979e03b 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -241,11 +241,9 @@ static int omfs_remove(struct inode *dir, struct dentry *dentry)
241 int ret; 241 int ret;
242 242
243 243
244 if (S_ISDIR(inode->i_mode)) { 244 if (S_ISDIR(inode->i_mode) &&
245 dentry_unhash(dentry); 245 !omfs_dir_is_empty(inode))
246 if (!omfs_dir_is_empty(inode)) 246 return -ENOTEMPTY;
247 return -ENOTEMPTY;
248 }
249 247
250 ret = omfs_delete_entry(dentry); 248 ret = omfs_delete_entry(dentry);
251 if (ret) 249 if (ret)
@@ -382,9 +380,6 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
382 int err; 380 int err;
383 381
384 if (new_inode) { 382 if (new_inode) {
385 if (S_ISDIR(new_inode->i_mode))
386 dentry_unhash(new_dentry);
387
388 /* overwriting existing file/dir */ 383 /* overwriting existing file/dir */
389 err = omfs_remove(new_dir, new_dentry); 384 err = omfs_remove(new_dir, new_dentry);
390 if (err) 385 if (err)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4ede550517a6..14def991d9dd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,9 @@
83#include <linux/pid_namespace.h> 83#include <linux/pid_namespace.h>
84#include <linux/fs_struct.h> 84#include <linux/fs_struct.h>
85#include <linux/slab.h> 85#include <linux/slab.h>
86#ifdef CONFIG_HARDWALL
87#include <asm/hardwall.h>
88#endif
86#include "internal.h" 89#include "internal.h"
87 90
88/* NOTE: 91/* NOTE:
@@ -2842,6 +2845,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2842#ifdef CONFIG_TASK_IO_ACCOUNTING 2845#ifdef CONFIG_TASK_IO_ACCOUNTING
2843 INF("io", S_IRUGO, proc_tgid_io_accounting), 2846 INF("io", S_IRUGO, proc_tgid_io_accounting),
2844#endif 2847#endif
2848#ifdef CONFIG_HARDWALL
2849 INF("hardwall", S_IRUGO, proc_pid_hardwall),
2850#endif
2845}; 2851};
2846 2852
2847static int proc_tgid_base_readdir(struct file * filp, 2853static int proc_tgid_base_readdir(struct file * filp,
@@ -3181,6 +3187,9 @@ static const struct pid_entry tid_base_stuff[] = {
3181#ifdef CONFIG_TASK_IO_ACCOUNTING 3187#ifdef CONFIG_TASK_IO_ACCOUNTING
3182 INF("io", S_IRUGO, proc_tid_io_accounting), 3188 INF("io", S_IRUGO, proc_tid_io_accounting),
3183#endif 3189#endif
3190#ifdef CONFIG_HARDWALL
3191 INF("hardwall", S_IRUGO, proc_pid_hardwall),
3192#endif
3184}; 3193};
3185 3194
3186static int proc_tid_base_readdir(struct file * filp, 3195static int proc_tid_base_readdir(struct file * filp,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 76c8164d5651..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -831,8 +831,6 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
831 INITIALIZE_PATH(path); 831 INITIALIZE_PATH(path);
832 struct reiserfs_dir_entry de; 832 struct reiserfs_dir_entry de;
833 833
834 dentry_unhash(dentry);
835
836 /* we will be doing 2 balancings and update 2 stat data, we change quotas 834 /* we will be doing 2 balancings and update 2 stat data, we change quotas
837 * of the owner of the directory and of the owner of the parent directory. 835 * of the owner of the directory and of the owner of the parent directory.
838 * The quota structure is possibly deleted only on last iput => outside 836 * The quota structure is possibly deleted only on last iput => outside
@@ -1227,9 +1225,6 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
1227 unsigned long savelink = 1; 1225 unsigned long savelink = 1;
1228 struct timespec ctime; 1226 struct timespec ctime;
1229 1227
1230 if (new_dentry->d_inode && S_ISDIR(new_dentry->d_inode->i_mode))
1231 dentry_unhash(new_dentry);
1232
1233 /* three balancings: (1) old name removal, (2) new name insertion 1228 /* three balancings: (1) old name removal, (2) new name insertion
1234 and (3) maybe "save" link insertion 1229 and (3) maybe "save" link insertion
1235 stat data updates: (1) old directory, 1230 stat data updates: (1) old directory,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b216ff6be1c9..aa91089162cb 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -568,7 +568,7 @@ static void destroy_inodecache(void)
568} 568}
569 569
570/* we don't mark inodes dirty, we just log them */ 570/* we don't mark inodes dirty, we just log them */
571static void reiserfs_dirty_inode(struct inode *inode) 571static void reiserfs_dirty_inode(struct inode *inode, int flags)
572{ 572{
573 struct reiserfs_transaction_handle th; 573 struct reiserfs_transaction_handle th;
574 574
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 50f1abccd1cd..e8a62f41b458 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -98,7 +98,6 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
98 98
99 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, 99 reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
100 I_MUTEX_CHILD, dir->i_sb); 100 I_MUTEX_CHILD, dir->i_sb);
101 dentry_unhash(dentry);
102 error = dir->i_op->rmdir(dir, dentry); 101 error = dir->i_op->rmdir(dir, dentry);
103 if (!error) 102 if (!error)
104 dentry->d_inode->i_flags |= S_DEAD; 103 dentry->d_inode->i_flags |= S_DEAD;
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 730c56248c9b..5e1101ff276f 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -147,7 +147,7 @@ __le64 *squashfs_read_inode_lookup_table(struct super_block *sb,
147 * table[0] points to the first inode lookup table metadata block, 147 * table[0] points to the first inode lookup table metadata block,
148 * this should be less than lookup_table_start 148 * this should be less than lookup_table_start
149 */ 149 */
150 if (!IS_ERR(table) && table[0] >= lookup_table_start) { 150 if (!IS_ERR(table) && le64_to_cpu(table[0]) >= lookup_table_start) {
151 kfree(table); 151 kfree(table);
152 return ERR_PTR(-EINVAL); 152 return ERR_PTR(-EINVAL);
153 } 153 }
diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c
index 1516a6490bfb..0ed6edbc5c71 100644
--- a/fs/squashfs/fragment.c
+++ b/fs/squashfs/fragment.c
@@ -90,7 +90,7 @@ __le64 *squashfs_read_fragment_index_table(struct super_block *sb,
90 * table[0] points to the first fragment table metadata block, this 90 * table[0] points to the first fragment table metadata block, this
91 * should be less than fragment_table_start 91 * should be less than fragment_table_start
92 */ 92 */
93 if (!IS_ERR(table) && table[0] >= fragment_table_start) { 93 if (!IS_ERR(table) && le64_to_cpu(table[0]) >= fragment_table_start) {
94 kfree(table); 94 kfree(table);
95 return ERR_PTR(-EINVAL); 95 return ERR_PTR(-EINVAL);
96 } 96 }
diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c
index a70858e0fb44..d38ea3dab951 100644
--- a/fs/squashfs/id.c
+++ b/fs/squashfs/id.c
@@ -93,7 +93,7 @@ __le64 *squashfs_read_id_index_table(struct super_block *sb,
93 * table[0] points to the first id lookup table metadata block, this 93 * table[0] points to the first id lookup table metadata block, this
94 * should be less than id_table_start 94 * should be less than id_table_start
95 */ 95 */
96 if (!IS_ERR(table) && table[0] >= id_table_start) { 96 if (!IS_ERR(table) && le64_to_cpu(table[0]) >= id_table_start) {
97 kfree(table); 97 kfree(table);
98 return ERR_PTR(-EINVAL); 98 return ERR_PTR(-EINVAL);
99 } 99 }
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 6f26abee3597..7438850c62d0 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -245,7 +245,7 @@ allocate_id_index_table:
245 msblk->id_table = NULL; 245 msblk->id_table = NULL;
246 goto failed_mount; 246 goto failed_mount;
247 } 247 }
248 next_table = msblk->id_table[0]; 248 next_table = le64_to_cpu(msblk->id_table[0]);
249 249
250 /* Handle inode lookup table */ 250 /* Handle inode lookup table */
251 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 251 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
@@ -261,7 +261,7 @@ allocate_id_index_table:
261 msblk->inode_lookup_table = NULL; 261 msblk->inode_lookup_table = NULL;
262 goto failed_mount; 262 goto failed_mount;
263 } 263 }
264 next_table = msblk->inode_lookup_table[0]; 264 next_table = le64_to_cpu(msblk->inode_lookup_table[0]);
265 265
266 sb->s_export_op = &squashfs_export_ops; 266 sb->s_export_op = &squashfs_export_ops;
267 267
@@ -286,7 +286,7 @@ handle_fragments:
286 msblk->fragment_index = NULL; 286 msblk->fragment_index = NULL;
287 goto failed_mount; 287 goto failed_mount;
288 } 288 }
289 next_table = msblk->fragment_index[0]; 289 next_table = le64_to_cpu(msblk->fragment_index[0]);
290 290
291check_directory_table: 291check_directory_table:
292 /* Sanity check directory_table */ 292 /* Sanity check directory_table */
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index e2cc6756f3b1..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -196,8 +196,6 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
196 struct inode *inode = dentry->d_inode; 196 struct inode *inode = dentry->d_inode;
197 int err = -ENOTEMPTY; 197 int err = -ENOTEMPTY;
198 198
199 dentry_unhash(dentry);
200
201 if (sysv_empty_dir(inode)) { 199 if (sysv_empty_dir(inode)) {
202 err = sysv_unlink(dir, dentry); 200 err = sysv_unlink(dir, dentry);
203 if (!err) { 201 if (!err) {
@@ -224,9 +222,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
224 struct sysv_dir_entry * old_de; 222 struct sysv_dir_entry * old_de;
225 int err = -ENOENT; 223 int err = -ENOENT;
226 224
227 if (new_inode && S_ISDIR(new_inode->i_mode))
228 dentry_unhash(new_dentry);
229
230 old_de = sysv_find_entry(old_dentry, &old_page); 225 old_de = sysv_find_entry(old_dentry, &old_page);
231 if (!old_de) 226 if (!old_de)
232 goto out; 227 goto out;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index c2b80943560d..ef5abd38f0bf 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -656,8 +656,6 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
656 struct ubifs_inode *dir_ui = ubifs_inode(dir); 656 struct ubifs_inode *dir_ui = ubifs_inode(dir);
657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; 657 struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
658 658
659 dentry_unhash(dentry);
660
661 /* 659 /*
662 * Budget request settings: deletion direntry, deletion inode and 660 * Budget request settings: deletion direntry, deletion inode and
663 * changing the parent inode. If budgeting fails, go ahead anyway 661 * changing the parent inode. If budgeting fails, go ahead anyway
@@ -978,9 +976,6 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
978 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; 976 .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
979 struct timespec time; 977 struct timespec time;
980 978
981 if (new_inode && S_ISDIR(new_inode->i_mode))
982 dentry_unhash(new_dentry);
983
984 /* 979 /*
985 * Budget request settings: deletion direntry, new direntry, removing 980 * Budget request settings: deletion direntry, new direntry, removing
986 * the old inode, and changing old and new parent directory inodes. 981 * the old inode, and changing old and new parent directory inodes.
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 46961c003236..ca953a945029 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,8 +277,9 @@ static int kick_a_thread(void)
277 return 0; 277 return 0;
278} 278}
279 279
280int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) 280int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
281{ 281{
282 int nr = sc->nr_to_scan;
282 int freed, contention = 0; 283 int freed, contention = 0;
283 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); 284 long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
284 285
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6db0bdaa9f74..1ab0d22e4c94 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -382,7 +382,7 @@ done:
382 end_writeback(inode); 382 end_writeback(inode);
383} 383}
384 384
385static void ubifs_dirty_inode(struct inode *inode) 385static void ubifs_dirty_inode(struct inode *inode, int flags)
386{ 386{
387 struct ubifs_inode *ui = ubifs_inode(inode); 387 struct ubifs_inode *ui = ubifs_inode(inode);
388 388
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 93d1412a06f0..a70d7b4ffb25 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1614,7 +1614,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
1614int ubifs_tnc_end_commit(struct ubifs_info *c); 1614int ubifs_tnc_end_commit(struct ubifs_info *c);
1615 1615
1616/* shrinker.c */ 1616/* shrinker.c */
1617int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); 1617int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
1618 1618
1619/* commit.c */ 1619/* commit.c */
1620int ubifs_bg_thread(void *info); 1620int ubifs_bg_thread(void *info);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 4d76594c2a8f..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -783,8 +783,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
783 struct fileIdentDesc *fi, cfi; 783 struct fileIdentDesc *fi, cfi;
784 struct kernel_lb_addr tloc; 784 struct kernel_lb_addr tloc;
785 785
786 dentry_unhash(dentry);
787
788 retval = -ENOENT; 786 retval = -ENOENT;
789 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 787 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
790 if (!fi) 788 if (!fi)
@@ -1083,9 +1081,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1083 struct kernel_lb_addr tloc; 1081 struct kernel_lb_addr tloc;
1084 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1082 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1085 1083
1086 if (new_inode && S_ISDIR(new_inode->i_mode))
1087 dentry_unhash(new_dentry);
1088
1089 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1084 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1090 if (ofi) { 1085 if (ofi) {
1091 if (ofibh.sbh != ofibh.ebh) 1086 if (ofibh.sbh != ofibh.ebh)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 953ebdfc5bf7..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -258,8 +258,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
258 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
259 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
260 260
261 dentry_unhash(dentry);
262
263 lock_ufs(dir->i_sb); 261 lock_ufs(dir->i_sb);
264 if (ufs_empty_dir (inode)) { 262 if (ufs_empty_dir (inode)) {
265 err = ufs_unlink(dir, dentry); 263 err = ufs_unlink(dir, dentry);
@@ -284,9 +282,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
284 struct ufs_dir_entry *old_de; 282 struct ufs_dir_entry *old_de;
285 int err = -ENOENT; 283 int err = -ENOENT;
286 284
287 if (new_inode && S_ISDIR(new_inode->i_mode))
288 dentry_unhash(new_dentry);
289
290 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 285 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
291 if (!old_de) 286 if (!old_de)
292 goto out; 287 goto out;
diff --git a/fs/xattr.c b/fs/xattr.c
index f1ef94974dea..f060663ab70c 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -46,18 +46,22 @@ xattr_permission(struct inode *inode, const char *name, int mask)
46 return 0; 46 return 0;
47 47
48 /* 48 /*
49 * The trusted.* namespace can only be accessed by a privileged user. 49 * The trusted.* namespace can only be accessed by privileged users.
50 */ 50 */
51 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) 51 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
52 return (capable(CAP_SYS_ADMIN) ? 0 : -EPERM); 52 if (!capable(CAP_SYS_ADMIN))
53 return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
54 return 0;
55 }
53 56
54 /* In user.* namespace, only regular files and directories can have 57 /*
58 * In the user.* namespace, only regular files and directories can have
55 * extended attributes. For sticky directories, only the owner and 59 * extended attributes. For sticky directories, only the owner and
56 * privileged user can write attributes. 60 * privileged users can write attributes.
57 */ 61 */
58 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { 62 if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) {
59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 63 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
60 return -EPERM; 64 return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && 65 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
62 (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) 66 (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
63 return -EPERM; 67 return -EPERM;
@@ -87,7 +91,11 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
87{ 91{
88 struct inode *inode = dentry->d_inode; 92 struct inode *inode = dentry->d_inode;
89 int error = -EOPNOTSUPP; 93 int error = -EOPNOTSUPP;
94 int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
95 XATTR_SECURITY_PREFIX_LEN);
90 96
97 if (issec)
98 inode->i_flags &= ~S_NOSEC;
91 if (inode->i_op->setxattr) { 99 if (inode->i_op->setxattr) {
92 error = inode->i_op->setxattr(dentry, name, value, size, flags); 100 error = inode->i_op->setxattr(dentry, name, value, size, flags);
93 if (!error) { 101 if (!error) {
@@ -95,8 +103,7 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
95 security_inode_post_setxattr(dentry, name, value, 103 security_inode_post_setxattr(dentry, name, value,
96 size, flags); 104 size, flags);
97 } 105 }
98 } else if (!strncmp(name, XATTR_SECURITY_PREFIX, 106 } else if (issec) {
99 XATTR_SECURITY_PREFIX_LEN)) {
100 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; 107 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
101 error = security_inode_setsecurity(inode, suffix, value, 108 error = security_inode_setsecurity(inode, suffix, value,
102 size, flags); 109 size, flags);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 98b9c91fcdf1..1e3a7ce804dc 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -925,7 +925,8 @@ xfs_fs_inode_init_once(
925 */ 925 */
926STATIC void 926STATIC void
927xfs_fs_dirty_inode( 927xfs_fs_dirty_inode(
928 struct inode *inode) 928 struct inode *inode,
929 int flags)
929{ 930{
930 barrier(); 931 barrier();
931 XFS_I(inode)->i_update_core = 1; 932 XFS_I(inode)->i_update_core = 1;