aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDavid Woodhouse <David.Woodhouse@intel.com>2009-09-20 08:55:36 -0400
committerDavid Woodhouse <David.Woodhouse@intel.com>2009-09-20 08:55:36 -0400
commit6469f540ea37d53db089c8fea9c0c77a3d9353d4 (patch)
tree1dc9dc077150d57f4424cae49e711b5dd6e903a1 /fs
parent304e6d5fe294b80e6d3107f99ec241816390ebcc (diff)
parent78f28b7c555359c67c2a0d23f7436e915329421e (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: drivers/mtd/mtdcore.c Merged in order that I can apply the Nomadik nand/onenand support patches.
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/binfmt_elf.c28
-rw-r--r--fs/block_dev.c30
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c3
-rw-r--r--fs/btrfs/ordered-data.c1
-rw-r--r--fs/btrfs/volumes.c4
-rw-r--r--fs/buffer.c2
-rw-r--r--fs/char_dev.c40
-rw-r--r--fs/cifs/CHANGES5
-rw-r--r--fs/cifs/cifs_spnego.c2
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c1
-rw-r--r--fs/cifs/cifsfs.c22
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h21
-rw-r--r--fs/cifs/cifssmb.c316
-rw-r--r--fs/cifs/connect.c49
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c43
-rw-r--r--fs/cifs/inode.c6
-rw-r--r--fs/cifs/transport.c17
-rw-r--r--fs/compat.c17
-rw-r--r--fs/configfs/inode.c1
-rw-r--r--fs/dcache.c1
-rw-r--r--fs/dlm/lowcomms.c26
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/exec.c63
-rw-r--r--fs/ext2/acl.c8
-rw-r--r--fs/ext2/acl.h4
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/namei.c8
-rw-r--r--fs/ext3/acl.c8
-rw-r--r--fs/ext3/acl.h4
-rw-r--r--fs/ext3/file.c63
-rw-r--r--fs/ext3/fsync.c12
-rw-r--r--fs/ext3/inode.c28
-rw-r--r--fs/ext3/namei.c4
-rw-r--r--fs/ext4/Kconfig11
-rw-r--r--fs/ext4/acl.c8
-rw-r--r--fs/ext4/acl.h4
-rw-r--r--fs/ext4/balloc.c2
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h4
-rw-r--r--fs/ext4/ext4_jbd2.c9
-rw-r--r--fs/ext4/extents.c112
-rw-r--r--fs/ext4/file.c55
-rw-r--r--fs/ext4/fsync.c13
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c150
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c429
-rw-r--r--fs/ext4/mballoc.h22
-rw-r--r--fs/ext4/migrate.c22
-rw-r--r--fs/ext4/move_extent.c334
-rw-r--r--fs/ext4/namei.c26
-rw-r--r--fs/ext4/resize.c7
-rw-r--r--fs/ext4/super.c155
-rw-r--r--fs/ext4/xattr.c15
-rw-r--r--fs/fat/file.c22
-rw-r--r--fs/fat/misc.c4
-rw-r--r--fs/fs-writeback.c1104
-rw-r--r--fs/fuse/control.c138
-rw-r--r--fs/fuse/dev.c10
-rw-r--r--fs/fuse/fuse_i.h18
-rw-r--r--fs/fuse/inode.c83
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c106
-rw-r--r--fs/gfs2/dentry.c18
-rw-r--r--fs/gfs2/eaops.c157
-rw-r--r--fs/gfs2/eaops.h30
-rw-r--r--fs/gfs2/export.c36
-rw-r--r--fs/gfs2/file.c1
-rw-r--r--fs/gfs2/incore.h15
-rw-r--r--fs/gfs2/inode.c159
-rw-r--r--fs/gfs2/ops_fstype.c66
-rw-r--r--fs/gfs2/ops_inode.c82
-rw-r--r--fs/gfs2/rgrp.c88
-rw-r--r--fs/gfs2/rgrp.h6
-rw-r--r--fs/gfs2/super.c46
-rw-r--r--fs/gfs2/super.h5
-rw-r--r--fs/gfs2/sys.c31
-rw-r--r--fs/gfs2/util.c41
-rw-r--r--fs/gfs2/xattr.c (renamed from fs/gfs2/eattr.c)425
-rw-r--r--fs/gfs2/xattr.h (renamed from fs/gfs2/eattr.h)54
-rw-r--r--fs/hugetlbfs/inode.c1
-rw-r--r--fs/inode.c4
-rw-r--r--fs/jbd/checkpoint.c6
-rw-r--r--fs/jbd/commit.c2
-rw-r--r--fs/jbd/journal.c30
-rw-r--r--fs/jbd/recovery.c18
-rw-r--r--fs/jbd/revoke.c16
-rw-r--r--fs/jbd/transaction.c9
-rw-r--r--fs/jbd2/commit.c12
-rw-r--r--fs/jbd2/journal.c6
-rw-r--r--fs/jbd2/transaction.c7
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/symlink.c2
-rw-r--r--fs/jfs/acl.c7
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/lockd/host.c14
-rw-r--r--fs/lockd/mon.c44
-rw-r--r--fs/locks.c4
-rw-r--r--fs/namei.c110
-rw-r--r--fs/nfs/Makefile3
-rw-r--r--fs/nfs/cache_lib.c140
-rw-r--r--fs/nfs/cache_lib.h27
-rw-r--r--fs/nfs/callback.c26
-rw-r--r--fs/nfs/client.c16
-rw-r--r--fs/nfs/direct.c3
-rw-r--r--fs/nfs/dns_resolve.c335
-rw-r--r--fs/nfs/dns_resolve.h14
-rw-r--r--fs/nfs/file.c49
-rw-r--r--fs/nfs/idmap.c6
-rw-r--r--fs/nfs/inode.c100
-rw-r--r--fs/nfs/internal.h39
-rw-r--r--fs/nfs/mount_clnt.c83
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs4namespace.c24
-rw-r--r--fs/nfs/nfs4proc.c40
-rw-r--r--fs/nfs/nfs4xdr.c1460
-rw-r--r--fs/nfs/super.c453
-rw-r--r--fs/nfs/write.c92
-rw-r--r--fs/nfsd/auth.c4
-rw-r--r--fs/nfsd/export.c14
-rw-r--r--fs/nfsd/nfs4idmap.c20
-rw-r--r--fs/nfsd/nfsctl.c21
-rw-r--r--fs/nfsd/nfssvc.c2
-rw-r--r--fs/nfsd/vfs.c3
-rw-r--r--fs/nilfs2/Kconfig2
-rw-r--r--fs/nilfs2/bmap.c151
-rw-r--r--fs/nilfs2/bmap.h76
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/nilfs2/btree.c625
-rw-r--r--fs/nilfs2/cpfile.c11
-rw-r--r--fs/nilfs2/cpfile.h2
-rw-r--r--fs/nilfs2/dat.c42
-rw-r--r--fs/nilfs2/dat.h8
-rw-r--r--fs/nilfs2/direct.c161
-rw-r--r--fs/nilfs2/ifile.h1
-rw-r--r--fs/nilfs2/inode.c3
-rw-r--r--fs/nilfs2/ioctl.c26
-rw-r--r--fs/nilfs2/mdt.c40
-rw-r--r--fs/nilfs2/mdt.h3
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/nilfs2/segbuf.c4
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/nilfs2/sufile.h1
-rw-r--r--fs/nilfs2/super.c100
-rw-r--r--fs/nilfs2/the_nilfs.c19
-rw-r--r--fs/nilfs2/the_nilfs.h43
-rw-r--r--fs/ntfs/file.c16
-rw-r--r--fs/ntfs/mft.c13
-rw-r--r--fs/ocfs2/aops.c4
-rw-r--r--fs/ocfs2/dcache.c11
-rw-r--r--fs/ocfs2/dlm/dlmfs.c1
-rw-r--r--fs/ocfs2/file.c49
-rw-r--r--fs/open.c12
-rw-r--r--fs/partitions/check.c14
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/splice.c30
-rw-r--r--fs/super.c11
-rw-r--r--fs/sync.c85
-rw-r--r--fs/sysfs/dir.c1
-rw-r--r--fs/sysfs/inode.c135
-rw-r--r--fs/sysfs/symlink.c2
-rw-r--r--fs/sysfs/sysfs.h12
-rw-r--r--fs/ubifs/budget.c32
-rw-r--r--fs/ubifs/super.c10
-rw-r--r--fs/udf/directory.c86
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c19
-rw-r--r--fs/udf/lowlevel.c4
-rw-r--r--fs/udf/namei.c1
-rw-r--r--fs/xattr.c55
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c19
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c17
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c51
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c24
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h1
-rw-r--r--fs/xfs/quota/xfs_qm_stats.c78
-rw-r--r--fs/xfs/xfs_ag.h9
-rw-r--r--fs/xfs/xfs_bmap.c2
-rw-r--r--fs/xfs/xfs_bmap.h11
-rw-r--r--fs/xfs/xfs_bmap_btree.c20
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c42
-rw-r--r--fs/xfs/xfs_btree.h15
-rw-r--r--fs/xfs/xfs_ialloc.c805
-rw-r--r--fs/xfs/xfs_ialloc.h18
-rw-r--r--fs/xfs/xfs_iget.c27
-rw-r--r--fs/xfs/xfs_inode.c8
-rw-r--r--fs/xfs/xfs_inode.h8
-rw-r--r--fs/xfs/xfs_inode_item.c10
-rw-r--r--fs/xfs/xfs_inode_item.h2
-rw-r--r--fs/xfs/xfs_inum.h1
-rw-r--r--fs/xfs/xfs_itable.c98
-rw-r--r--fs/xfs/xfs_itable.h5
-rw-r--r--fs/xfs/xfs_log_priv.h2
-rw-r--r--fs/xfs/xfs_log_recover.c2
-rw-r--r--fs/xfs/xfs_mount.c2
-rw-r--r--fs/xfs/xfs_mount.h3
-rw-r--r--fs/xfs/xfs_mru_cache.c29
-rw-r--r--fs/xfs/xfs_mru_cache.h1
-rw-r--r--fs/xfs/xfs_rw.c84
-rw-r--r--fs/xfs/xfs_rw.h7
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_buf.c4
-rw-r--r--fs/xfs/xfs_trans_inode.c86
-rw-r--r--fs/xfs/xfs_vnodeops.c17
221 files changed, 6531 insertions, 5310 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..455aa207e67e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
43source "fs/gfs2/Kconfig" 43source "fs/gfs2/Kconfig"
44source "fs/ocfs2/Kconfig" 44source "fs/ocfs2/Kconfig"
45source "fs/btrfs/Kconfig" 45source "fs/btrfs/Kconfig"
46source "fs/nilfs2/Kconfig"
46 47
47endif # BLOCK 48endif # BLOCK
48 49
@@ -186,7 +187,6 @@ source "fs/romfs/Kconfig"
186source "fs/sysv/Kconfig" 187source "fs/sysv/Kconfig"
187source "fs/ufs/Kconfig" 188source "fs/ufs/Kconfig"
188source "fs/exofs/Kconfig" 189source "fs/exofs/Kconfig"
189source "fs/nilfs2/Kconfig"
190 190
191endif # MISC_FILESYSTEMS 191endif # MISC_FILESYSTEMS
192 192
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
712 .bdi = mapping->backing_dev_info, 712 .bdi = mapping->backing_dev_info,
713 .sync_mode = WB_SYNC_ALL, 713 .sync_mode = WB_SYNC_ALL,
714 .nr_to_write = LONG_MAX, 714 .nr_to_write = LONG_MAX,
715 .for_writepages = 1,
716 .range_cyclic = 1, 715 .range_cyclic = 1,
717 }; 716 };
718 int ret; 717 int ret;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
501 } 501 }
502 } 502 }
503 503
504 /* 504 if (last_bss > elf_bss) {
505 * Now fill out the bss section. First pad the last page up 505 /*
506 * to the page boundary, and then perform a mmap to make sure 506 * Now fill out the bss section. First pad the last page up
507 * that there are zero-mapped pages up to and including the 507 * to the page boundary, and then perform a mmap to make sure
508 * last bss page. 508 * that there are zero-mapped pages up to and including the
509 */ 509 * last bss page.
510 if (padzero(elf_bss)) { 510 */
511 error = -EFAULT; 511 if (padzero(elf_bss)) {
512 goto out_close; 512 error = -EFAULT;
513 } 513 goto out_close;
514 }
514 515
515 /* What we have mapped so far */ 516 /* What we have mapped so far */
516 elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); 517 elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
517 518
518 /* Map the last of the bss segment */ 519 /* Map the last of the bss segment */
519 if (last_bss > elf_bss) {
520 down_write(&current->mm->mmap_sem); 520 down_write(&current->mm->mmap_sem);
521 error = do_brk(elf_bss, last_bss - elf_bss); 521 error = do_brk(elf_bss, last_bss - elf_bss);
522 up_write(&current->mm->mmap_sem); 522 up_write(&current->mm->mmap_sem);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 94dfda24c06e..71e7e03ac343 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
420{ 420{
421 struct bdev_inode *bdi = BDEV_I(inode); 421 struct bdev_inode *bdi = BDEV_I(inode);
422 422
423 bdi->bdev.bd_inode_backing_dev_info = NULL;
424 kmem_cache_free(bdev_cachep, bdi); 423 kmem_cache_free(bdev_cachep, bdi);
425} 424}
426 425
@@ -1405,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
1405} 1404}
1406 1405
1407/* 1406/*
1407 * Write data to the block device. Only intended for the block device itself
1408 * and the raw driver which basically is a fake block device.
1409 *
1410 * Does not take i_mutex for the write and thus is not for general purpose
1411 * use.
1412 */
1413ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
1414 unsigned long nr_segs, loff_t pos)
1415{
1416 struct file *file = iocb->ki_filp;
1417 ssize_t ret;
1418
1419 BUG_ON(iocb->ki_pos != pos);
1420
1421 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
1422 if (ret > 0 || ret == -EIOCBQUEUED) {
1423 ssize_t err;
1424
1425 err = generic_write_sync(file, pos, ret);
1426 if (err < 0 && ret > 0)
1427 ret = err;
1428 }
1429 return ret;
1430}
1431EXPORT_SYMBOL_GPL(blkdev_aio_write);
1432
1433/*
1408 * Try to release a page associated with block device when the system 1434 * Try to release a page associated with block device when the system
1409 * is under memory pressure. 1435 * is under memory pressure.
1410 */ 1436 */
@@ -1436,7 +1462,7 @@ const struct file_operations def_blk_fops = {
1436 .read = do_sync_read, 1462 .read = do_sync_read,
1437 .write = do_sync_write, 1463 .write = do_sync_write,
1438 .aio_read = generic_file_aio_read, 1464 .aio_read = generic_file_aio_read,
1439 .aio_write = generic_file_aio_write_nolock, 1465 .aio_write = blkdev_aio_write,
1440 .mmap = generic_file_mmap, 1466 .mmap = generic_file_mmap,
1441 .fsync = block_fsync, 1467 .fsync = block_fsync,
1442 .unlocked_ioctl = block_ioctl, 1468 .unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e83be2e4602c..8b8192790011 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1352{ 1352{
1353 int err; 1353 int err;
1354 1354
1355 bdi->name = "btrfs";
1355 bdi->capabilities = BDI_CAP_MAP_COPY; 1356 bdi->capabilities = BDI_CAP_MAP_COPY;
1356 err = bdi_init(bdi); 1357 err = bdi_init(bdi);
1357 if (err) 1358 if (err)
@@ -1599,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1599 1600
1600 sb->s_blocksize = 4096; 1601 sb->s_blocksize = 4096;
1601 sb->s_blocksize_bits = blksize_bits(4096); 1602 sb->s_blocksize_bits = blksize_bits(4096);
1603 sb->s_bdi = &fs_info->bdi;
1602 1604
1603 /* 1605 /*
1604 * we set the i_size on the btree inode to the max possible int. 1606 * we set the i_size on the btree inode to the max possible int.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 72a2b9c28e9f..535f85ba104f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1511,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1511static void btrfs_issue_discard(struct block_device *bdev, 1511static void btrfs_issue_discard(struct block_device *bdev,
1512 u64 start, u64 len) 1512 u64 start, u64 len)
1513{ 1513{
1514 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); 1514 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1515 DISCARD_FL_BARRIER);
1515} 1516}
1516#endif 1517#endif
1517 1518
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index d6f0806c682f..7b2f401e604e 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
740 .nr_to_write = mapping->nrpages * 2, 740 .nr_to_write = mapping->nrpages * 2,
741 .range_start = start, 741 .range_start = start,
742 .range_end = end, 742 .range_end = end,
743 .for_writepages = 1,
744 }; 743 };
745 return btrfs_writepages(mapping, &wbc); 744 return btrfs_writepages(mapping, &wbc);
746} 745}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 5dbefd11b4af..5cf405b0828d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -260,7 +260,7 @@ loop_lock:
260 num_run++; 260 num_run++;
261 batch_run++; 261 batch_run++;
262 262
263 if (bio_sync(cur)) 263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 264 num_sync_run++;
265 265
266 if (need_resched()) { 266 if (need_resched()) {
@@ -2903,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2903 bio->bi_rw |= rw; 2903 bio->bi_rw |= rw;
2904 2904
2905 spin_lock(&device->io_lock); 2905 spin_lock(&device->io_lock);
2906 if (bio_sync(bio)) 2906 if (bio_rw_flagged(bio, BIO_RW_SYNCIO))
2907 pending_bios = &device->pending_sync_bios; 2907 pending_bios = &device->pending_sync_bios;
2908 else 2908 else
2909 pending_bios = &device->pending_bios; 2909 pending_bios = &device->pending_bios;
diff --git a/fs/buffer.c b/fs/buffer.c
index 28f320fac4d4..90a98865b0cc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -281,7 +281,7 @@ static void free_more_memory(void)
281 struct zone *zone; 281 struct zone *zone;
282 int nid; 282 int nid;
283 283
284 wakeup_pdflush(1024); 284 wakeup_flusher_threads(1024);
285 yield(); 285 yield();
286 286
287 for_each_online_node(nid) { 287 for_each_online_node(nid) {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index a173551e19d7..3cbc57f932d2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -31,6 +31,7 @@
31 * - no readahead or I/O queue unplugging required 31 * - no readahead or I/O queue unplugging required
32 */ 32 */
33struct backing_dev_info directly_mappable_cdev_bdi = { 33struct backing_dev_info directly_mappable_cdev_bdi = {
34 .name = "char",
34 .capabilities = ( 35 .capabilities = (
35#ifdef CONFIG_MMU 36#ifdef CONFIG_MMU
36 /* permit private copies of the data to be taken */ 37 /* permit private copies of the data to be taken */
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
237} 238}
238 239
239/** 240/**
240 * register_chrdev() - Register a major number for character devices. 241 * __register_chrdev() - create and register a cdev occupying a range of minors
241 * @major: major device number or 0 for dynamic allocation 242 * @major: major device number or 0 for dynamic allocation
243 * @baseminor: first of the requested range of minor numbers
244 * @count: the number of minor numbers required
242 * @name: name of this range of devices 245 * @name: name of this range of devices
243 * @fops: file operations associated with this devices 246 * @fops: file operations associated with this devices
244 * 247 *
@@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
254 * /dev. It only helps to keep track of the different owners of devices. If 257 * /dev. It only helps to keep track of the different owners of devices. If
255 * your module name has only one type of devices it's ok to use e.g. the name 258 * your module name has only one type of devices it's ok to use e.g. the name
256 * of the module here. 259 * of the module here.
257 *
258 * This function registers a range of 256 minor numbers. The first minor number
259 * is 0.
260 */ 260 */
261int register_chrdev(unsigned int major, const char *name, 261int __register_chrdev(unsigned int major, unsigned int baseminor,
262 const struct file_operations *fops) 262 unsigned int count, const char *name,
263 const struct file_operations *fops)
263{ 264{
264 struct char_device_struct *cd; 265 struct char_device_struct *cd;
265 struct cdev *cdev; 266 struct cdev *cdev;
266 char *s; 267 char *s;
267 int err = -ENOMEM; 268 int err = -ENOMEM;
268 269
269 cd = __register_chrdev_region(major, 0, 256, name); 270 cd = __register_chrdev_region(major, baseminor, count, name);
270 if (IS_ERR(cd)) 271 if (IS_ERR(cd))
271 return PTR_ERR(cd); 272 return PTR_ERR(cd);
272 273
@@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name,
280 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) 281 for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/'))
281 *s = '!'; 282 *s = '!';
282 283
283 err = cdev_add(cdev, MKDEV(cd->major, 0), 256); 284 err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
284 if (err) 285 if (err)
285 goto out; 286 goto out;
286 287
@@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name,
290out: 291out:
291 kobject_put(&cdev->kobj); 292 kobject_put(&cdev->kobj);
292out2: 293out2:
293 kfree(__unregister_chrdev_region(cd->major, 0, 256)); 294 kfree(__unregister_chrdev_region(cd->major, baseminor, count));
294 return err; 295 return err;
295} 296}
296 297
@@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count)
316 } 317 }
317} 318}
318 319
319void unregister_chrdev(unsigned int major, const char *name) 320/**
321 * __unregister_chrdev - unregister and destroy a cdev
322 * @major: major device number
323 * @baseminor: first of the range of minor numbers
324 * @count: the number of minor numbers this cdev is occupying
325 * @name: name of this range of devices
326 *
327 * Unregister and destroy the cdev occupying the region described by
328 * @major, @baseminor and @count. This function undoes what
329 * __register_chrdev() did.
330 */
331void __unregister_chrdev(unsigned int major, unsigned int baseminor,
332 unsigned int count, const char *name)
320{ 333{
321 struct char_device_struct *cd; 334 struct char_device_struct *cd;
322 cd = __unregister_chrdev_region(major, 0, 256); 335
336 cd = __unregister_chrdev_region(major, baseminor, count);
323 if (cd && cd->cdev) 337 if (cd && cd->cdev)
324 cdev_del(cd->cdev); 338 cdev_del(cd->cdev);
325 kfree(cd); 339 kfree(cd);
@@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc);
568EXPORT_SYMBOL(cdev_del); 582EXPORT_SYMBOL(cdev_del);
569EXPORT_SYMBOL(cdev_add); 583EXPORT_SYMBOL(cdev_add);
570EXPORT_SYMBOL(cdev_index); 584EXPORT_SYMBOL(cdev_index);
571EXPORT_SYMBOL(register_chrdev); 585EXPORT_SYMBOL(__register_chrdev);
572EXPORT_SYMBOL(unregister_chrdev); 586EXPORT_SYMBOL(__unregister_chrdev);
573EXPORT_SYMBOL(directly_mappable_cdev_bdi); 587EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index e85b1e4389e0..145540a316ab 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -3,7 +3,10 @@ Version 1.60
3Fix memory leak in reconnect. Fix oops in DFS mount error path. 3Fix memory leak in reconnect. Fix oops in DFS mount error path.
4Set s_maxbytes to smaller (the max that vfs can handle) so that 4Set s_maxbytes to smaller (the max that vfs can handle) so that
5sendfile will now work over cifs mounts again. Add noforcegid 5sendfile will now work over cifs mounts again. Add noforcegid
6and noforceuid mount parameters. 6and noforceuid mount parameters. Fix small mem leak when using
7ntlmv2. Fix 2nd mount to same server but with different port to
8be allowed (rather than reusing the 1st port) - only when the
9user explicitly overrides the port on the 2nd mount.
7 10
8Version 1.59 11Version 1.59
9------------ 12------------
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 051caecf7d67..8ec7736ce954 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
125 if (server->addr.sockAddr.sin_family == AF_INET) 125 if (server->addr.sockAddr.sin_family == AF_INET)
126 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); 126 sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr);
127 else if (server->addr.sockAddr.sin_family == AF_INET6) 127 else if (server->addr.sockAddr.sin_family == AF_INET6)
128 sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); 128 sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr);
129 else 129 else
130 goto out; 130 goto out;
131 131
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 6941c22398a6..7dfe0842a6f6 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
607 return get_cifs_acl_by_path(cifs_sb, path, pacllen); 607 return get_cifs_acl_by_path(cifs_sb, path, pacllen);
608 608
609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); 609 pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
610 atomic_dec(&open_file->wrtPending); 610 cifsFileInfo_put(open_file);
611 return pntsd; 611 return pntsd;
612} 612}
613 613
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); 665 return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
666 666
667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); 667 rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
668 atomic_dec(&open_file->wrtPending); 668 cifsFileInfo_put(open_file);
669 return rc; 669 return rc;
670} 670}
671 671
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 7c9809523f42..7efe1745494d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -373,6 +373,7 @@ calc_exit_2:
373 compare with the NTLM example */ 373 compare with the NTLM example */
374 hmac_md5_final(ses->server->ntlmv2_hash, pctxt); 374 hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
375 375
376 kfree(pctxt);
376 return rc; 377 return rc;
377} 378}
378 379
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 84b75253b05a..3610e9958b4c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
361static int 361static int
362cifs_show_options(struct seq_file *s, struct vfsmount *m) 362cifs_show_options(struct seq_file *s, struct vfsmount *m)
363{ 363{
364 struct cifs_sb_info *cifs_sb; 364 struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb);
365 struct cifsTconInfo *tcon; 365 struct cifsTconInfo *tcon = cifs_sb->tcon;
366
367 cifs_sb = CIFS_SB(m->mnt_sb);
368 tcon = cifs_sb->tcon;
369 366
370 seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); 367 seq_printf(s, ",unc=%s", tcon->treeName);
371 if (tcon->ses->userName) 368 if (tcon->ses->userName)
372 seq_printf(s, ",username=%s", tcon->ses->userName); 369 seq_printf(s, ",username=%s", tcon->ses->userName);
373 if (tcon->ses->domainName) 370 if (tcon->ses->domainName)
@@ -989,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg)
989 if (try_to_freeze()) 986 if (try_to_freeze())
990 continue; 987 continue;
991 988
992 spin_lock(&GlobalMid_Lock); 989 spin_lock(&cifs_oplock_lock);
993 if (list_empty(&GlobalOplock_Q)) { 990 if (list_empty(&cifs_oplock_list)) {
994 spin_unlock(&GlobalMid_Lock); 991 spin_unlock(&cifs_oplock_lock);
995 set_current_state(TASK_INTERRUPTIBLE); 992 set_current_state(TASK_INTERRUPTIBLE);
996 schedule_timeout(39*HZ); 993 schedule_timeout(39*HZ);
997 } else { 994 } else {
998 oplock_item = list_entry(GlobalOplock_Q.next, 995 oplock_item = list_entry(cifs_oplock_list.next,
999 struct oplock_q_entry, qhead); 996 struct oplock_q_entry, qhead);
1000 cFYI(1, ("found oplock item to write out")); 997 cFYI(1, ("found oplock item to write out"));
1001 pTcon = oplock_item->tcon; 998 pTcon = oplock_item->tcon;
1002 inode = oplock_item->pinode; 999 inode = oplock_item->pinode;
1003 netfid = oplock_item->netfid; 1000 netfid = oplock_item->netfid;
1004 spin_unlock(&GlobalMid_Lock); 1001 spin_unlock(&cifs_oplock_lock);
1005 DeleteOplockQEntry(oplock_item); 1002 DeleteOplockQEntry(oplock_item);
1006 /* can not grab inode sem here since it would 1003 /* can not grab inode sem here since it would
1007 deadlock when oplock received on delete 1004 deadlock when oplock received on delete
@@ -1058,7 +1055,7 @@ init_cifs(void)
1058 int rc = 0; 1055 int rc = 0;
1059 cifs_proc_init(); 1056 cifs_proc_init();
1060 INIT_LIST_HEAD(&cifs_tcp_ses_list); 1057 INIT_LIST_HEAD(&cifs_tcp_ses_list);
1061 INIT_LIST_HEAD(&GlobalOplock_Q); 1058 INIT_LIST_HEAD(&cifs_oplock_list);
1062#ifdef CONFIG_CIFS_EXPERIMENTAL 1059#ifdef CONFIG_CIFS_EXPERIMENTAL
1063 INIT_LIST_HEAD(&GlobalDnotifyReqList); 1060 INIT_LIST_HEAD(&GlobalDnotifyReqList);
1064 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); 1061 INIT_LIST_HEAD(&GlobalDnotifyRsp_Q);
@@ -1087,6 +1084,7 @@ init_cifs(void)
1087 rwlock_init(&GlobalSMBSeslock); 1084 rwlock_init(&GlobalSMBSeslock);
1088 rwlock_init(&cifs_tcp_ses_lock); 1085 rwlock_init(&cifs_tcp_ses_lock);
1089 spin_lock_init(&GlobalMid_Lock); 1086 spin_lock_init(&GlobalMid_Lock);
1087 spin_lock_init(&cifs_oplock_lock);
1090 1088
1091 if (cifs_max_pending < 2) { 1089 if (cifs_max_pending < 2) {
1092 cifs_max_pending = 2; 1090 cifs_max_pending = 2;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 6c170948300d..094325e3f714 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
113extern const struct export_operations cifs_export_ops; 113extern const struct export_operations cifs_export_ops;
114#endif /* EXPERIMENTAL */ 114#endif /* EXPERIMENTAL */
115 115
116#define CIFS_VERSION "1.60" 116#define CIFS_VERSION "1.61"
117#endif /* _CIFSFS_H */ 117#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6084d6379c03..6cfc81a32703 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -351,11 +351,24 @@ struct cifsFileInfo {
351 bool closePend:1; /* file is marked to close */ 351 bool closePend:1; /* file is marked to close */
352 bool invalidHandle:1; /* file closed via session abend */ 352 bool invalidHandle:1; /* file closed via session abend */
353 bool messageMode:1; /* for pipes: message vs byte mode */ 353 bool messageMode:1; /* for pipes: message vs byte mode */
354 atomic_t wrtPending; /* handle in use - defer close */ 354 atomic_t count; /* reference count */
355 struct mutex fh_mutex; /* prevents reopen race after dead ses*/ 355 struct mutex fh_mutex; /* prevents reopen race after dead ses*/
356 struct cifs_search_info srch_inf; 356 struct cifs_search_info srch_inf;
357}; 357};
358 358
359/* Take a reference on the file private data */
360static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file)
361{
362 atomic_inc(&cifs_file->count);
363}
364
365/* Release a reference on the file private data */
366static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
367{
368 if (atomic_dec_and_test(&cifs_file->count))
369 kfree(cifs_file);
370}
371
359/* 372/*
360 * One of these for each file inode 373 * One of these for each file inode
361 */ 374 */
@@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock;
656 */ 669 */
657GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; 670GLOBAL_EXTERN rwlock_t GlobalSMBSeslock;
658 671
659GLOBAL_EXTERN struct list_head GlobalOplock_Q; 672/* Global list of oplocks */
673GLOBAL_EXTERN struct list_head cifs_oplock_list;
674
675/* Protects the cifs_oplock_list */
676GLOBAL_EXTERN spinlock_t cifs_oplock_lock;
660 677
661/* Outstanding dir notify requests */ 678/* Outstanding dir notify requests */
662GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; 679GLOBAL_EXTERN struct list_head GlobalDnotifyReqList;
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1866bc2927d4..301e307e1279 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
100 to this tcon */ 100 to this tcon */
101} 101}
102 102
103/* Allocate and return pointer to an SMB request buffer, and set basic 103/* reconnect the socket, tcon, and smb session if needed */
104 SMB information in the SMB header. If the return code is zero, this
105 function must have filled in request_buf pointer */
106static int 104static int
107small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, 105cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
108 void **request_buf)
109{ 106{
110 int rc = 0; 107 int rc = 0;
108 struct cifsSesInfo *ses;
109 struct TCP_Server_Info *server;
110 struct nls_table *nls_codepage;
111 111
112 /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so 112 /*
113 check for tcp and smb session status done differently 113 * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for
114 for those three - in the calling routine */ 114 * tcp and smb session status done differently for those three - in the
115 if (tcon) { 115 * calling routine
116 if (tcon->tidStatus == CifsExiting) { 116 */
117 /* only tree disconnect, open, and write, 117 if (!tcon)
118 (and ulogoff which does not have tcon) 118 return 0;
119 are allowed as we start force umount */ 119
120 if ((smb_command != SMB_COM_WRITE_ANDX) && 120 ses = tcon->ses;
121 (smb_command != SMB_COM_OPEN_ANDX) && 121 server = ses->server;
122 (smb_command != SMB_COM_TREE_DISCONNECT)) { 122
123 cFYI(1, ("can not send cmd %d while umounting", 123 /*
124 smb_command)); 124 * only tree disconnect, open, and write, (and ulogoff which does not
125 return -ENODEV; 125 * have tcon) are allowed as we start force umount
126 } 126 */
127 if (tcon->tidStatus == CifsExiting) {
128 if (smb_command != SMB_COM_WRITE_ANDX &&
129 smb_command != SMB_COM_OPEN_ANDX &&
130 smb_command != SMB_COM_TREE_DISCONNECT) {
131 cFYI(1, ("can not send cmd %d while umounting",
132 smb_command));
133 return -ENODEV;
127 } 134 }
128 if ((tcon->ses) && (tcon->ses->status != CifsExiting) && 135 }
129 (tcon->ses->server)) {
130 struct nls_table *nls_codepage;
131 /* Give Demultiplex thread up to 10 seconds to
132 reconnect, should be greater than cifs socket
133 timeout which is 7 seconds */
134 while (tcon->ses->server->tcpStatus ==
135 CifsNeedReconnect) {
136 wait_event_interruptible_timeout(tcon->ses->server->response_q,
137 (tcon->ses->server->tcpStatus ==
138 CifsGood), 10 * HZ);
139 if (tcon->ses->server->tcpStatus ==
140 CifsNeedReconnect) {
141 /* on "soft" mounts we wait once */
142 if (!tcon->retry ||
143 (tcon->ses->status == CifsExiting)) {
144 cFYI(1, ("gave up waiting on "
145 "reconnect in smb_init"));
146 return -EHOSTDOWN;
147 } /* else "hard" mount - keep retrying
148 until process is killed or server
149 comes back on-line */
150 } else /* TCP session is reestablished now */
151 break;
152 }
153 136
154 nls_codepage = load_nls_default(); 137 if (ses->status == CifsExiting)
155 /* need to prevent multiple threads trying to 138 return -EIO;
156 simultaneously reconnect the same SMB session */
157 down(&tcon->ses->sesSem);
158 if (tcon->ses->need_reconnect)
159 rc = cifs_setup_session(0, tcon->ses,
160 nls_codepage);
161 if (!rc && (tcon->need_reconnect)) {
162 mark_open_files_invalid(tcon);
163 rc = CIFSTCon(0, tcon->ses, tcon->treeName,
164 tcon, nls_codepage);
165 up(&tcon->ses->sesSem);
166 /* BB FIXME add code to check if wsize needs
167 update due to negotiated smb buffer size
168 shrinking */
169 if (rc == 0) {
170 atomic_inc(&tconInfoReconnectCount);
171 /* tell server Unix caps we support */
172 if (tcon->ses->capabilities & CAP_UNIX)
173 reset_cifs_unix_caps(
174 0 /* no xid */,
175 tcon,
176 NULL /* we do not know sb */,
177 NULL /* no vol info */);
178 }
179 139
180 cFYI(1, ("reconnect tcon rc = %d", rc)); 140 /*
181 /* Removed call to reopen open files here. 141 * Give demultiplex thread up to 10 seconds to reconnect, should be
182 It is safer (and faster) to reopen files 142 * greater than cifs socket timeout which is 7 seconds
183 one at a time as needed in read and write */ 143 */
184 144 while (server->tcpStatus == CifsNeedReconnect) {
185 /* Check if handle based operation so we 145 wait_event_interruptible_timeout(server->response_q,
186 know whether we can continue or not without 146 (server->tcpStatus == CifsGood), 10 * HZ);
187 returning to caller to reset file handle */
188 switch (smb_command) {
189 case SMB_COM_READ_ANDX:
190 case SMB_COM_WRITE_ANDX:
191 case SMB_COM_CLOSE:
192 case SMB_COM_FIND_CLOSE2:
193 case SMB_COM_LOCKING_ANDX: {
194 unload_nls(nls_codepage);
195 return -EAGAIN;
196 }
197 }
198 } else {
199 up(&tcon->ses->sesSem);
200 }
201 unload_nls(nls_codepage);
202 147
203 } else { 148 /* is TCP session is reestablished now ?*/
204 return -EIO; 149 if (server->tcpStatus != CifsNeedReconnect)
150 break;
151
152 /*
153 * on "soft" mounts we wait once. Hard mounts keep
154 * retrying until process is killed or server comes
155 * back on-line
156 */
157 if (!tcon->retry || ses->status == CifsExiting) {
158 cFYI(1, ("gave up waiting on reconnect in smb_init"));
159 return -EHOSTDOWN;
205 } 160 }
206 } 161 }
162
163 if (!ses->need_reconnect && !tcon->need_reconnect)
164 return 0;
165
166 nls_codepage = load_nls_default();
167
168 /*
169 * need to prevent multiple threads trying to simultaneously
170 * reconnect the same SMB session
171 */
172 down(&ses->sesSem);
173 if (ses->need_reconnect)
174 rc = cifs_setup_session(0, ses, nls_codepage);
175
176 /* do we need to reconnect tcon? */
177 if (rc || !tcon->need_reconnect) {
178 up(&ses->sesSem);
179 goto out;
180 }
181
182 mark_open_files_invalid(tcon);
183 rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage);
184 up(&ses->sesSem);
185 cFYI(1, ("reconnect tcon rc = %d", rc));
186
187 if (rc)
188 goto out;
189
190 /*
191 * FIXME: check if wsize needs updated due to negotiated smb buffer
192 * size shrinking
193 */
194 atomic_inc(&tconInfoReconnectCount);
195
196 /* tell server Unix caps we support */
197 if (ses->capabilities & CAP_UNIX)
198 reset_cifs_unix_caps(0, tcon, NULL, NULL);
199
200 /*
201 * Removed call to reopen open files here. It is safer (and faster) to
202 * reopen files one at a time as needed in read and write.
203 *
204 * FIXME: what about file locks? don't we need to reclaim them ASAP?
205 */
206
207out:
208 /*
209 * Check if handle based operation so we know whether we can continue
210 * or not without returning to caller to reset file handle
211 */
212 switch (smb_command) {
213 case SMB_COM_READ_ANDX:
214 case SMB_COM_WRITE_ANDX:
215 case SMB_COM_CLOSE:
216 case SMB_COM_FIND_CLOSE2:
217 case SMB_COM_LOCKING_ANDX:
218 rc = -EAGAIN;
219 }
220
221 unload_nls(nls_codepage);
222 return rc;
223}
224
225/* Allocate and return pointer to an SMB request buffer, and set basic
226 SMB information in the SMB header. If the return code is zero, this
227 function must have filled in request_buf pointer */
228static int
229small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
230 void **request_buf)
231{
232 int rc = 0;
233
234 rc = cifs_reconnect_tcon(tcon, smb_command);
207 if (rc) 235 if (rc)
208 return rc; 236 return rc;
209 237
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon,
256{ 284{
257 int rc = 0; 285 int rc = 0;
258 286
259 /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so 287 rc = cifs_reconnect_tcon(tcon, smb_command);
260 check for tcp and smb session status done differently
261 for those three - in the calling routine */
262 if (tcon) {
263 if (tcon->tidStatus == CifsExiting) {
264 /* only tree disconnect, open, and write,
265 (and ulogoff which does not have tcon)
266 are allowed as we start force umount */
267 if ((smb_command != SMB_COM_WRITE_ANDX) &&
268 (smb_command != SMB_COM_OPEN_ANDX) &&
269 (smb_command != SMB_COM_TREE_DISCONNECT)) {
270 cFYI(1, ("can not send cmd %d while umounting",
271 smb_command));
272 return -ENODEV;
273 }
274 }
275
276 if ((tcon->ses) && (tcon->ses->status != CifsExiting) &&
277 (tcon->ses->server)) {
278 struct nls_table *nls_codepage;
279 /* Give Demultiplex thread up to 10 seconds to
280 reconnect, should be greater than cifs socket
281 timeout which is 7 seconds */
282 while (tcon->ses->server->tcpStatus ==
283 CifsNeedReconnect) {
284 wait_event_interruptible_timeout(tcon->ses->server->response_q,
285 (tcon->ses->server->tcpStatus ==
286 CifsGood), 10 * HZ);
287 if (tcon->ses->server->tcpStatus ==
288 CifsNeedReconnect) {
289 /* on "soft" mounts we wait once */
290 if (!tcon->retry ||
291 (tcon->ses->status == CifsExiting)) {
292 cFYI(1, ("gave up waiting on "
293 "reconnect in smb_init"));
294 return -EHOSTDOWN;
295 } /* else "hard" mount - keep retrying
296 until process is killed or server
297 comes on-line */
298 } else /* TCP session is reestablished now */
299 break;
300 }
301 nls_codepage = load_nls_default();
302 /* need to prevent multiple threads trying to
303 simultaneously reconnect the same SMB session */
304 down(&tcon->ses->sesSem);
305 if (tcon->ses->need_reconnect)
306 rc = cifs_setup_session(0, tcon->ses,
307 nls_codepage);
308 if (!rc && (tcon->need_reconnect)) {
309 mark_open_files_invalid(tcon);
310 rc = CIFSTCon(0, tcon->ses, tcon->treeName,
311 tcon, nls_codepage);
312 up(&tcon->ses->sesSem);
313 /* BB FIXME add code to check if wsize needs
314 update due to negotiated smb buffer size
315 shrinking */
316 if (rc == 0) {
317 atomic_inc(&tconInfoReconnectCount);
318 /* tell server Unix caps we support */
319 if (tcon->ses->capabilities & CAP_UNIX)
320 reset_cifs_unix_caps(
321 0 /* no xid */,
322 tcon,
323 NULL /* do not know sb */,
324 NULL /* no vol info */);
325 }
326
327 cFYI(1, ("reconnect tcon rc = %d", rc));
328 /* Removed call to reopen open files here.
329 It is safer (and faster) to reopen files
330 one at a time as needed in read and write */
331
332 /* Check if handle based operation so we
333 know whether we can continue or not without
334 returning to caller to reset file handle */
335 switch (smb_command) {
336 case SMB_COM_READ_ANDX:
337 case SMB_COM_WRITE_ANDX:
338 case SMB_COM_CLOSE:
339 case SMB_COM_FIND_CLOSE2:
340 case SMB_COM_LOCKING_ANDX: {
341 unload_nls(nls_codepage);
342 return -EAGAIN;
343 }
344 }
345 } else {
346 up(&tcon->ses->sesSem);
347 }
348 unload_nls(nls_codepage);
349
350 } else {
351 return -EIO;
352 }
353 }
354 if (rc) 288 if (rc)
355 return rc; 289 return rc;
356 290
@@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
3961 if (is_unicode) { 3895 if (is_unicode) {
3962 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, 3896 __le16 *tmp = kmalloc(strlen(searchName)*2 + 2,
3963 GFP_KERNEL); 3897 GFP_KERNEL);
3898 if (tmp == NULL) {
3899 rc = -ENOMEM;
3900 goto parse_DFS_referrals_exit;
3901 }
3964 cifsConvertToUCS((__le16 *) tmp, searchName, 3902 cifsConvertToUCS((__le16 *) tmp, searchName,
3965 PATH_MAX, nls_codepage, remap); 3903 PATH_MAX, nls_codepage, remap);
3966 node->path_consumed = cifs_ucs2_bytes(tmp, 3904 node->path_consumed = cifs_ucs2_bytes(tmp,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 1f3345d7fa79..d49682433c20 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1377,7 +1377,7 @@ cifs_parse_mount_options(char *options, const char *devname,
1377} 1377}
1378 1378
1379static struct TCP_Server_Info * 1379static struct TCP_Server_Info *
1380cifs_find_tcp_session(struct sockaddr_storage *addr) 1380cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port)
1381{ 1381{
1382 struct list_head *tmp; 1382 struct list_head *tmp;
1383 struct TCP_Server_Info *server; 1383 struct TCP_Server_Info *server;
@@ -1397,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
1397 if (server->tcpStatus == CifsNew) 1397 if (server->tcpStatus == CifsNew)
1398 continue; 1398 continue;
1399 1399
1400 if (addr->ss_family == AF_INET && 1400 switch (addr->ss_family) {
1401 (addr4->sin_addr.s_addr != 1401 case AF_INET:
1402 server->addr.sockAddr.sin_addr.s_addr)) 1402 if (addr4->sin_addr.s_addr ==
1403 continue; 1403 server->addr.sockAddr.sin_addr.s_addr) {
1404 else if (addr->ss_family == AF_INET6 && 1404 addr4->sin_port = htons(port);
1405 (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, 1405 /* user overrode default port? */
1406 &addr6->sin6_addr) || 1406 if (addr4->sin_port) {
1407 server->addr.sockAddr6.sin6_scope_id != 1407 if (addr4->sin_port !=
1408 addr6->sin6_scope_id)) 1408 server->addr.sockAddr.sin_port)
1409 continue; 1409 continue;
1410 }
1411 break;
1412 } else
1413 continue;
1414
1415 case AF_INET6:
1416 if (ipv6_addr_equal(&addr6->sin6_addr,
1417 &server->addr.sockAddr6.sin6_addr) &&
1418 (addr6->sin6_scope_id ==
1419 server->addr.sockAddr6.sin6_scope_id)) {
1420 addr6->sin6_port = htons(port);
1421 /* user overrode default port? */
1422 if (addr6->sin6_port) {
1423 if (addr6->sin6_port !=
1424 server->addr.sockAddr6.sin6_port)
1425 continue;
1426 }
1427 break;
1428 } else
1429 continue;
1430 }
1410 1431
1411 ++server->srv_count; 1432 ++server->srv_count;
1412 write_unlock(&cifs_tcp_ses_lock); 1433 write_unlock(&cifs_tcp_ses_lock);
@@ -1475,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1475 } 1496 }
1476 1497
1477 /* see if we already have a matching tcp_ses */ 1498 /* see if we already have a matching tcp_ses */
1478 tcp_ses = cifs_find_tcp_session(&addr); 1499 tcp_ses = cifs_find_tcp_session(&addr, volume_info->port);
1479 if (tcp_ses) 1500 if (tcp_ses)
1480 return tcp_ses; 1501 return tcp_ses;
1481 1502
@@ -2636,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
2636 return -EIO; 2657 return -EIO;
2637 2658
2638 smb_buffer = cifs_buf_get(); 2659 smb_buffer = cifs_buf_get();
2639 if (smb_buffer == NULL) { 2660 if (smb_buffer == NULL)
2640 return -ENOMEM; 2661 return -ENOMEM;
2641 } 2662
2642 smb_buffer_response = smb_buffer; 2663 smb_buffer_response = smb_buffer;
2643 2664
2644 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, 2665 header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 4326ffd90fa9..a6424cfc0121 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle,
153 mutex_init(&pCifsFile->fh_mutex); 153 mutex_init(&pCifsFile->fh_mutex);
154 mutex_init(&pCifsFile->lock_mutex); 154 mutex_init(&pCifsFile->lock_mutex);
155 INIT_LIST_HEAD(&pCifsFile->llist); 155 INIT_LIST_HEAD(&pCifsFile->llist);
156 atomic_set(&pCifsFile->wrtPending, 0); 156 atomic_set(&pCifsFile->count, 1);
157 157
158 /* set the following in open now 158 /* set the following in open now
159 pCifsFile->pfile = file; */ 159 pCifsFile->pfile = file; */
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c34b7f8a217b..fa7beac8b80e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private(
53 private_data->pInode = inode; 53 private_data->pInode = inode;
54 private_data->invalidHandle = false; 54 private_data->invalidHandle = false;
55 private_data->closePend = false; 55 private_data->closePend = false;
56 /* we have to track num writers to the inode, since writepages 56 /* Initialize reference count to one. The private data is
57 does not tell us which handle the write is for so there can 57 freed on the release of the last reference */
58 be a close (overlapping with write) of the filehandle that 58 atomic_set(&private_data->count, 1);
59 cifs_writepages chose to use */
60 atomic_set(&private_data->wrtPending, 0);
61 59
62 return private_data; 60 return private_data;
63} 61}
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file)
643 if (!pTcon->need_reconnect) { 641 if (!pTcon->need_reconnect) {
644 write_unlock(&GlobalSMBSeslock); 642 write_unlock(&GlobalSMBSeslock);
645 timeout = 2; 643 timeout = 2;
646 while ((atomic_read(&pSMBFile->wrtPending) != 0) 644 while ((atomic_read(&pSMBFile->count) != 1)
647 && (timeout <= 2048)) { 645 && (timeout <= 2048)) {
648 /* Give write a better chance to get to 646 /* Give write a better chance to get to
649 server ahead of the close. We do not 647 server ahead of the close. We do not
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file)
657 msleep(timeout); 655 msleep(timeout);
658 timeout *= 4; 656 timeout *= 4;
659 } 657 }
660 if (atomic_read(&pSMBFile->wrtPending))
661 cERROR(1, ("close with pending write"));
662 if (!pTcon->need_reconnect && 658 if (!pTcon->need_reconnect &&
663 !pSMBFile->invalidHandle) 659 !pSMBFile->invalidHandle)
664 rc = CIFSSMBClose(xid, pTcon, 660 rc = CIFSSMBClose(xid, pTcon,
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file)
681 list_del(&pSMBFile->flist); 677 list_del(&pSMBFile->flist);
682 list_del(&pSMBFile->tlist); 678 list_del(&pSMBFile->tlist);
683 write_unlock(&GlobalSMBSeslock); 679 write_unlock(&GlobalSMBSeslock);
684 timeout = 10; 680 cifsFileInfo_put(file->private_data);
685 /* We waited above to give the SMBWrite a chance to issue
686 on the wire (so we do not get SMBWrite returning EBADF
687 if writepages is racing with close. Note that writepages
688 does not specify a file handle, so it is possible for a file
689 to be opened twice, and the application close the "wrong"
690 file handle - in these cases we delay long enough to allow
691 the SMBWrite to get on the wire before the SMB Close.
692 We allow total wait here over 45 seconds, more than
693 oplock break time, and more than enough to allow any write
694 to complete on the server, or to time out on the client */
695 while ((atomic_read(&pSMBFile->wrtPending) != 0)
696 && (timeout <= 50000)) {
697 cERROR(1, ("writes pending, delay free of handle"));
698 msleep(timeout);
699 timeout *= 8;
700 }
701 kfree(file->private_data);
702 file->private_data = NULL; 681 file->private_data = NULL;
703 } else 682 } else
704 rc = -EBADF; 683 rc = -EBADF;
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
1236 if (!open_file->invalidHandle) { 1215 if (!open_file->invalidHandle) {
1237 /* found a good file */ 1216 /* found a good file */
1238 /* lock it so it will not be closed on us */ 1217 /* lock it so it will not be closed on us */
1239 atomic_inc(&open_file->wrtPending); 1218 cifsFileInfo_get(open_file);
1240 read_unlock(&GlobalSMBSeslock); 1219 read_unlock(&GlobalSMBSeslock);
1241 return open_file; 1220 return open_file;
1242 } /* else might as well continue, and look for 1221 } /* else might as well continue, and look for
@@ -1276,7 +1255,7 @@ refind_writable:
1276 if (open_file->pfile && 1255 if (open_file->pfile &&
1277 ((open_file->pfile->f_flags & O_RDWR) || 1256 ((open_file->pfile->f_flags & O_RDWR) ||
1278 (open_file->pfile->f_flags & O_WRONLY))) { 1257 (open_file->pfile->f_flags & O_WRONLY))) {
1279 atomic_inc(&open_file->wrtPending); 1258 cifsFileInfo_get(open_file);
1280 1259
1281 if (!open_file->invalidHandle) { 1260 if (!open_file->invalidHandle) {
1282 /* found a good writable file */ 1261 /* found a good writable file */
@@ -1293,7 +1272,7 @@ refind_writable:
1293 else { /* start over in case this was deleted */ 1272 else { /* start over in case this was deleted */
1294 /* since the list could be modified */ 1273 /* since the list could be modified */
1295 read_lock(&GlobalSMBSeslock); 1274 read_lock(&GlobalSMBSeslock);
1296 atomic_dec(&open_file->wrtPending); 1275 cifsFileInfo_put(open_file);
1297 goto refind_writable; 1276 goto refind_writable;
1298 } 1277 }
1299 } 1278 }
@@ -1309,7 +1288,7 @@ refind_writable:
1309 read_lock(&GlobalSMBSeslock); 1288 read_lock(&GlobalSMBSeslock);
1310 /* can not use this handle, no write 1289 /* can not use this handle, no write
1311 pending on this one after all */ 1290 pending on this one after all */
1312 atomic_dec(&open_file->wrtPending); 1291 cifsFileInfo_put(open_file);
1313 1292
1314 if (open_file->closePend) /* list could have changed */ 1293 if (open_file->closePend) /* list could have changed */
1315 goto refind_writable; 1294 goto refind_writable;
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1373 if (open_file) { 1352 if (open_file) {
1374 bytes_written = cifs_write(open_file->pfile, write_data, 1353 bytes_written = cifs_write(open_file->pfile, write_data,
1375 to-from, &offset); 1354 to-from, &offset);
1376 atomic_dec(&open_file->wrtPending); 1355 cifsFileInfo_put(open_file);
1377 /* Does mm or vfs already set times? */ 1356 /* Does mm or vfs already set times? */
1378 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); 1357 inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb);
1379 if ((bytes_written > 0) && (offset)) 1358 if ((bytes_written > 0) && (offset))
@@ -1562,7 +1541,7 @@ retry:
1562 bytes_to_write, offset, 1541 bytes_to_write, offset,
1563 &bytes_written, iov, n_iov, 1542 &bytes_written, iov, n_iov,
1564 long_op); 1543 long_op);
1565 atomic_dec(&open_file->wrtPending); 1544 cifsFileInfo_put(open_file);
1566 cifs_update_eof(cifsi, offset, bytes_written); 1545 cifs_update_eof(cifsi, offset, bytes_written);
1567 1546
1568 if (rc || bytes_written < bytes_to_write) { 1547 if (rc || bytes_written < bytes_to_write) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 82d83839655e..1f09c7619319 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -800,7 +800,7 @@ set_via_filehandle:
800 if (open_file == NULL) 800 if (open_file == NULL)
801 CIFSSMBClose(xid, pTcon, netfid); 801 CIFSSMBClose(xid, pTcon, netfid);
802 else 802 else
803 atomic_dec(&open_file->wrtPending); 803 cifsFileInfo_put(open_file);
804out: 804out:
805 return rc; 805 return rc;
806} 806}
@@ -1635,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
1635 __u32 npid = open_file->pid; 1635 __u32 npid = open_file->pid;
1636 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, 1636 rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
1637 npid, false); 1637 npid, false);
1638 atomic_dec(&open_file->wrtPending); 1638 cifsFileInfo_put(open_file);
1639 cFYI(1, ("SetFSize for attrs rc = %d", rc)); 1639 cFYI(1, ("SetFSize for attrs rc = %d", rc));
1640 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { 1640 if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
1641 unsigned int bytes_written; 1641 unsigned int bytes_written;
@@ -1790,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
1790 u16 nfid = open_file->netfid; 1790 u16 nfid = open_file->netfid;
1791 u32 npid = open_file->pid; 1791 u32 npid = open_file->pid;
1792 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); 1792 rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid);
1793 atomic_dec(&open_file->wrtPending); 1793 cifsFileInfo_put(open_file);
1794 } else { 1794 } else {
1795 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, 1795 rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args,
1796 cifs_sb->local_nls, 1796 cifs_sb->local_nls,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 0ad3e2d116a6..1da4ab250eae 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon)
119 temp->pinode = pinode; 119 temp->pinode = pinode;
120 temp->tcon = tcon; 120 temp->tcon = tcon;
121 temp->netfid = fid; 121 temp->netfid = fid;
122 spin_lock(&GlobalMid_Lock); 122 spin_lock(&cifs_oplock_lock);
123 list_add_tail(&temp->qhead, &GlobalOplock_Q); 123 list_add_tail(&temp->qhead, &cifs_oplock_list);
124 spin_unlock(&GlobalMid_Lock); 124 spin_unlock(&cifs_oplock_lock);
125 } 125 }
126 return temp; 126 return temp;
127
128} 127}
129 128
130void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) 129void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry)
131{ 130{
132 spin_lock(&GlobalMid_Lock); 131 spin_lock(&cifs_oplock_lock);
133 /* should we check if list empty first? */ 132 /* should we check if list empty first? */
134 list_del(&oplockEntry->qhead); 133 list_del(&oplockEntry->qhead);
135 spin_unlock(&GlobalMid_Lock); 134 spin_unlock(&cifs_oplock_lock);
136 kmem_cache_free(cifs_oplock_cachep, oplockEntry); 135 kmem_cache_free(cifs_oplock_cachep, oplockEntry);
137} 136}
138 137
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon)
144 if (tcon == NULL) 143 if (tcon == NULL)
145 return; 144 return;
146 145
147 spin_lock(&GlobalMid_Lock); 146 spin_lock(&cifs_oplock_lock);
148 list_for_each_entry(temp, &GlobalOplock_Q, qhead) { 147 list_for_each_entry(temp, &cifs_oplock_list, qhead) {
149 if ((temp->tcon) && (temp->tcon == tcon)) { 148 if ((temp->tcon) && (temp->tcon == tcon)) {
150 list_del(&temp->qhead); 149 list_del(&temp->qhead);
151 kmem_cache_free(cifs_oplock_cachep, temp); 150 kmem_cache_free(cifs_oplock_cachep, temp);
152 } 151 }
153 } 152 }
154 spin_unlock(&GlobalMid_Lock); 153 spin_unlock(&cifs_oplock_lock);
155} 154}
156 155
157static int 156static int
diff --git a/fs/compat.c b/fs/compat.c
index 94502dab972a..6d6f98fe64a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename,
1485 if (!bprm) 1485 if (!bprm)
1486 goto out_files; 1486 goto out_files;
1487 1487
1488 retval = -ERESTARTNOINTR; 1488 retval = prepare_bprm_creds(bprm);
1489 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1489 if (retval)
1490 goto out_free; 1490 goto out_free;
1491 current->in_execve = 1;
1492
1493 retval = -ENOMEM;
1494 bprm->cred = prepare_exec_creds();
1495 if (!bprm->cred)
1496 goto out_unlock;
1497 1491
1498 retval = check_unsafe_exec(bprm); 1492 retval = check_unsafe_exec(bprm);
1499 if (retval < 0) 1493 if (retval < 0)
1500 goto out_unlock; 1494 goto out_free;
1501 clear_in_exec = retval; 1495 clear_in_exec = retval;
1496 current->in_execve = 1;
1502 1497
1503 file = open_exec(filename); 1498 file = open_exec(filename);
1504 retval = PTR_ERR(file); 1499 retval = PTR_ERR(file);
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename,
1547 /* execve succeeded */ 1542 /* execve succeeded */
1548 current->fs->in_exec = 0; 1543 current->fs->in_exec = 0;
1549 current->in_execve = 0; 1544 current->in_execve = 0;
1550 mutex_unlock(&current->cred_guard_mutex);
1551 acct_update_integrals(current); 1545 acct_update_integrals(current);
1552 free_bprm(bprm); 1546 free_bprm(bprm);
1553 if (displaced) 1547 if (displaced)
@@ -1567,10 +1561,7 @@ out_file:
1567out_unmark: 1561out_unmark:
1568 if (clear_in_exec) 1562 if (clear_in_exec)
1569 current->fs->in_exec = 0; 1563 current->fs->in_exec = 0;
1570
1571out_unlock:
1572 current->in_execve = 0; 1564 current->in_execve = 0;
1573 mutex_unlock(&current->cred_guard_mutex);
1574 1565
1575out_free: 1566out_free:
1576 free_bprm(bprm); 1567 free_bprm(bprm);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4921e7426d95..a2f746066c5d 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = {
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = { 53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
54 .ra_pages = 0, /* No readahead */ 55 .ra_pages = 0, /* No readahead */
55 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
56}; 57};
diff --git a/fs/dcache.c b/fs/dcache.c
index 9e5cd3c3a6ba..a100fa35a48f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
32#include <linux/swap.h> 32#include <linux/swap.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/fs_struct.h> 34#include <linux/fs_struct.h>
35#include <linux/hardirq.h>
35#include "internal.h" 36#include "internal.h"
36 37
37int sysctl_vfs_cache_pressure __read_mostly = 100; 38int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 618a60f03886..240cef14fe58 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -106,6 +106,7 @@ struct connection {
106#define CF_CONNECT_PENDING 3 106#define CF_CONNECT_PENDING 3
107#define CF_INIT_PENDING 4 107#define CF_INIT_PENDING 4
108#define CF_IS_OTHERCON 5 108#define CF_IS_OTHERCON 5
109#define CF_CLOSE 6
109 struct list_head writequeue; /* List of outgoing writequeue_entries */ 110 struct list_head writequeue; /* List of outgoing writequeue_entries */
110 spinlock_t writequeue_lock; 111 spinlock_t writequeue_lock;
111 int (*rx_action) (struct connection *); /* What to do when active */ 112 int (*rx_action) (struct connection *); /* What to do when active */
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk)
299 300
300static inline void lowcomms_connect_sock(struct connection *con) 301static inline void lowcomms_connect_sock(struct connection *con)
301{ 302{
303 if (test_bit(CF_CLOSE, &con->flags))
304 return;
302 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) 305 if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
303 queue_work(send_workqueue, &con->swork); 306 queue_work(send_workqueue, &con->swork);
304} 307}
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con)
926 goto out_err; 929 goto out_err;
927 930
928 memset(&saddr, 0, sizeof(saddr)); 931 memset(&saddr, 0, sizeof(saddr));
929 if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { 932 if (dlm_nodeid_to_addr(con->nodeid, &saddr))
930 sock_release(sock);
931 goto out_err; 933 goto out_err;
932 }
933 934
934 sock->sk->sk_user_data = con; 935 sock->sk->sk_user_data = con;
935 con->rx_action = receive_from_sock; 936 con->rx_action = receive_from_sock;
@@ -1284,7 +1285,6 @@ out:
1284static void send_to_sock(struct connection *con) 1285static void send_to_sock(struct connection *con)
1285{ 1286{
1286 int ret = 0; 1287 int ret = 0;
1287 ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; 1288 const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1289 struct writequeue_entry *e; 1289 struct writequeue_entry *e;
1290 int len, offset; 1290 int len, offset;
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con)
1293 if (con->sock == NULL) 1293 if (con->sock == NULL)
1294 goto out_connect; 1294 goto out_connect;
1295 1295
1296 sendpage = con->sock->ops->sendpage;
1297
1298 spin_lock(&con->writequeue_lock); 1296 spin_lock(&con->writequeue_lock);
1299 for (;;) { 1297 for (;;) {
1300 e = list_entry(con->writequeue.next, struct writequeue_entry, 1298 e = list_entry(con->writequeue.next, struct writequeue_entry,
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con)
1309 1307
1310 ret = 0; 1308 ret = 0;
1311 if (len) { 1309 if (len) {
1312 ret = sendpage(con->sock, e->page, offset, len, 1310 ret = kernel_sendpage(con->sock, e->page, offset, len,
1313 msg_flags); 1311 msg_flags);
1314 if (ret == -EAGAIN || ret == 0) { 1312 if (ret == -EAGAIN || ret == 0) {
1315 cond_resched(); 1313 cond_resched();
1316 goto out; 1314 goto out;
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid)
1370 log_print("closing connection to node %d", nodeid); 1368 log_print("closing connection to node %d", nodeid);
1371 con = nodeid2con(nodeid, 0); 1369 con = nodeid2con(nodeid, 0);
1372 if (con) { 1370 if (con) {
1371 clear_bit(CF_CONNECT_PENDING, &con->flags);
1372 clear_bit(CF_WRITE_PENDING, &con->flags);
1373 set_bit(CF_CLOSE, &con->flags);
1374 if (cancel_work_sync(&con->swork))
1375 log_print("canceled swork for node %d", nodeid);
1376 if (cancel_work_sync(&con->rwork))
1377 log_print("canceled rwork for node %d", nodeid);
1373 clean_one_writequeue(con); 1378 clean_one_writequeue(con);
1374 close_connection(con, true); 1379 close_connection(con, true);
1375 } 1380 }
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work)
1395 1400
1396 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { 1401 if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
1397 con->connect_action(con); 1402 con->connect_action(con);
1403 set_bit(CF_WRITE_PENDING, &con->flags);
1398 } 1404 }
1399 clear_bit(CF_WRITE_PENDING, &con->flags); 1405 if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags))
1400 send_to_sock(con); 1406 send_to_sock(con);
1401} 1407}
1402 1408
1403 1409
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index ccc9d62c462d..55ea369f43a9 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb)
63 return rv; 63 return rv;
64 } 64 }
65 65
66 return genlmsg_unicast(skb, listener_nlpid); 66 return genlmsg_unicast(&init_net, skb, listener_nlpid);
67} 67}
68 68
69static int user_cmd(struct sk_buff *skb, struct genl_info *info) 69static int user_cmd(struct sk_buff *skb, struct genl_info *info)
diff --git a/fs/exec.c b/fs/exec.c
index fb4f3cdda78c..172ceb6edde4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1016,6 +1016,35 @@ out:
1016EXPORT_SYMBOL(flush_old_exec); 1016EXPORT_SYMBOL(flush_old_exec);
1017 1017
1018/* 1018/*
1019 * Prepare credentials and lock ->cred_guard_mutex.
1020 * install_exec_creds() commits the new creds and drops the lock.
1021 * Or, if exec fails before, free_bprm() should release ->cred and
1022 * unlock.
1023 */
1024int prepare_bprm_creds(struct linux_binprm *bprm)
1025{
1026 if (mutex_lock_interruptible(&current->cred_guard_mutex))
1027 return -ERESTARTNOINTR;
1028
1029 bprm->cred = prepare_exec_creds();
1030 if (likely(bprm->cred))
1031 return 0;
1032
1033 mutex_unlock(&current->cred_guard_mutex);
1034 return -ENOMEM;
1035}
1036
1037void free_bprm(struct linux_binprm *bprm)
1038{
1039 free_arg_pages(bprm);
1040 if (bprm->cred) {
1041 mutex_unlock(&current->cred_guard_mutex);
1042 abort_creds(bprm->cred);
1043 }
1044 kfree(bprm);
1045}
1046
1047/*
1019 * install the new credentials for this executable 1048 * install the new credentials for this executable
1020 */ 1049 */
1021void install_exec_creds(struct linux_binprm *bprm) 1050void install_exec_creds(struct linux_binprm *bprm)
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm)
1024 1053
1025 commit_creds(bprm->cred); 1054 commit_creds(bprm->cred);
1026 bprm->cred = NULL; 1055 bprm->cred = NULL;
1027 1056 /*
1028 /* cred_guard_mutex must be held at least to this point to prevent 1057 * cred_guard_mutex must be held at least to this point to prevent
1029 * ptrace_attach() from altering our determination of the task's 1058 * ptrace_attach() from altering our determination of the task's
1030 * credentials; any time after this it may be unlocked */ 1059 * credentials; any time after this it may be unlocked.
1031 1060 */
1032 security_bprm_committed_creds(bprm); 1061 security_bprm_committed_creds(bprm);
1062 mutex_unlock(&current->cred_guard_mutex);
1033} 1063}
1034EXPORT_SYMBOL(install_exec_creds); 1064EXPORT_SYMBOL(install_exec_creds);
1035 1065
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
1246 1276
1247EXPORT_SYMBOL(search_binary_handler); 1277EXPORT_SYMBOL(search_binary_handler);
1248 1278
1249void free_bprm(struct linux_binprm *bprm)
1250{
1251 free_arg_pages(bprm);
1252 if (bprm->cred)
1253 abort_creds(bprm->cred);
1254 kfree(bprm);
1255}
1256
1257/* 1279/*
1258 * sys_execve() executes a new program. 1280 * sys_execve() executes a new program.
1259 */ 1281 */
@@ -1277,20 +1299,15 @@ int do_execve(char * filename,
1277 if (!bprm) 1299 if (!bprm)
1278 goto out_files; 1300 goto out_files;
1279 1301
1280 retval = -ERESTARTNOINTR; 1302 retval = prepare_bprm_creds(bprm);
1281 if (mutex_lock_interruptible(&current->cred_guard_mutex)) 1303 if (retval)
1282 goto out_free; 1304 goto out_free;
1283 current->in_execve = 1;
1284
1285 retval = -ENOMEM;
1286 bprm->cred = prepare_exec_creds();
1287 if (!bprm->cred)
1288 goto out_unlock;
1289 1305
1290 retval = check_unsafe_exec(bprm); 1306 retval = check_unsafe_exec(bprm);
1291 if (retval < 0) 1307 if (retval < 0)
1292 goto out_unlock; 1308 goto out_free;
1293 clear_in_exec = retval; 1309 clear_in_exec = retval;
1310 current->in_execve = 1;
1294 1311
1295 file = open_exec(filename); 1312 file = open_exec(filename);
1296 retval = PTR_ERR(file); 1313 retval = PTR_ERR(file);
@@ -1340,7 +1357,6 @@ int do_execve(char * filename,
1340 /* execve succeeded */ 1357 /* execve succeeded */
1341 current->fs->in_exec = 0; 1358 current->fs->in_exec = 0;
1342 current->in_execve = 0; 1359 current->in_execve = 0;
1343 mutex_unlock(&current->cred_guard_mutex);
1344 acct_update_integrals(current); 1360 acct_update_integrals(current);
1345 free_bprm(bprm); 1361 free_bprm(bprm);
1346 if (displaced) 1362 if (displaced)
@@ -1360,10 +1376,7 @@ out_file:
1360out_unmark: 1376out_unmark:
1361 if (clear_in_exec) 1377 if (clear_in_exec)
1362 current->fs->in_exec = 0; 1378 current->fs->in_exec = 0;
1363
1364out_unlock:
1365 current->in_execve = 0; 1379 current->in_execve = 0;
1366 mutex_unlock(&current->cred_guard_mutex);
1367 1380
1368out_free: 1381out_free:
1369 free_bprm(bprm); 1382 free_bprm(bprm);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index d636e1297cad..a63d44256a70 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
230 return error; 230 return error;
231} 231}
232 232
233static int 233int
234ext2_check_acl(struct inode *inode, int mask) 234ext2_check_acl(struct inode *inode, int mask)
235{ 235{
236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); 236 struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask)
246 return -EAGAIN; 246 return -EAGAIN;
247} 247}
248 248
249int
250ext2_permission(struct inode *inode, int mask)
251{
252 return generic_permission(inode, mask, ext2_check_acl);
253}
254
255/* 249/*
256 * Initialize the ACLs of a new inode. Called from ext2_new_inode. 250 * Initialize the ACLs of a new inode. Called from ext2_new_inode.
257 * 251 *
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index ecefe478898f..3ff6cbb9ac44 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size)
54#ifdef CONFIG_EXT2_FS_POSIX_ACL 54#ifdef CONFIG_EXT2_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext2_permission (struct inode *, int); 57extern int ext2_check_acl (struct inode *, int);
58extern int ext2_acl_chmod (struct inode *); 58extern int ext2_acl_chmod (struct inode *);
59extern int ext2_init_acl (struct inode *, struct inode *); 59extern int ext2_init_acl (struct inode *, struct inode *);
60 60
61#else 61#else
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext2_permission NULL 63#define ext2_check_acl NULL
64#define ext2_get_acl NULL 64#define ext2_get_acl NULL
65#define ext2_set_acl NULL 65#define ext2_set_acl NULL
66 66
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b9e47dc9222..a2f3afd1a1c1 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = {
85 .removexattr = generic_removexattr, 85 .removexattr = generic_removexattr,
86#endif 86#endif
87 .setattr = ext2_setattr, 87 .setattr = ext2_setattr,
88 .permission = ext2_permission, 88 .check_acl = ext2_check_acl,
89 .fiemap = ext2_fiemap, 89 .fiemap = ext2_fiemap,
90}; 90};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e27130341d4f..1c1638f873a4 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode,
482 unlock_buffer(bh); 482 unlock_buffer(bh);
483 mark_buffer_dirty_inode(bh, inode); 483 mark_buffer_dirty_inode(bh, inode);
484 /* We used to sync bh here if IS_SYNC(inode). 484 /* We used to sync bh here if IS_SYNC(inode).
485 * But we now rely upon generic_osync_inode() 485 * But we now rely upon generic_write_sync()
486 * and b_inode_buffers. But not for directories. 486 * and b_inode_buffers. But not for directories.
487 */ 487 */
488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 488 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e1dedb0f7873..23701f289e98 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
362 if (dir_de) { 362 if (dir_de) {
363 if (old_dir != new_dir) 363 if (old_dir != new_dir)
364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); 364 ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
365 else {
366 kunmap(dir_page);
367 page_cache_release(dir_page);
368 }
365 inode_dec_link_count(old_dir); 369 inode_dec_link_count(old_dir);
366 } 370 }
367 return 0; 371 return 0;
@@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = {
396 .removexattr = generic_removexattr, 400 .removexattr = generic_removexattr,
397#endif 401#endif
398 .setattr = ext2_setattr, 402 .setattr = ext2_setattr,
399 .permission = ext2_permission, 403 .check_acl = ext2_check_acl,
400}; 404};
401 405
402const struct inode_operations ext2_special_inode_operations = { 406const struct inode_operations ext2_special_inode_operations = {
@@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = {
407 .removexattr = generic_removexattr, 411 .removexattr = generic_removexattr,
408#endif 412#endif
409 .setattr = ext2_setattr, 413 .setattr = ext2_setattr,
410 .permission = ext2_permission, 414 .check_acl = ext2_check_acl,
411}; 415};
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e167bae37ef0..c9b0df376b5f 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
238 return error; 238 return error;
239} 239}
240 240
241static int 241int
242ext3_check_acl(struct inode *inode, int mask) 242ext3_check_acl(struct inode *inode, int mask)
243{ 243{
244 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); 244 struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask)
254 return -EAGAIN; 254 return -EAGAIN;
255} 255}
256 256
257int
258ext3_permission(struct inode *inode, int mask)
259{
260 return generic_permission(inode, mask, ext3_check_acl);
261}
262
263/* 257/*
264 * Initialize the ACLs of a new inode. Called from ext3_new_inode. 258 * Initialize the ACLs of a new inode. Called from ext3_new_inode.
265 * 259 *
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 07d15a3a5969..597334626de9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size)
54#ifdef CONFIG_EXT3_FS_POSIX_ACL 54#ifdef CONFIG_EXT3_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext3_permission (struct inode *, int); 57extern int ext3_check_acl (struct inode *, int);
58extern int ext3_acl_chmod (struct inode *); 58extern int ext3_acl_chmod (struct inode *);
59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); 59extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
60 60
61#else /* CONFIG_EXT3_FS_POSIX_ACL */ 61#else /* CONFIG_EXT3_FS_POSIX_ACL */
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext3_permission NULL 63#define ext3_check_acl NULL
64 64
65static inline int 65static inline int
66ext3_acl_chmod(struct inode *inode) 66ext3_acl_chmod(struct inode *inode)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 5b49704b231b..388bbdfa0b4e 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
51 return 0; 51 return 0;
52} 52}
53 53
54static ssize_t
55ext3_file_write(struct kiocb *iocb, const struct iovec *iov,
56 unsigned long nr_segs, loff_t pos)
57{
58 struct file *file = iocb->ki_filp;
59 struct inode *inode = file->f_path.dentry->d_inode;
60 ssize_t ret;
61 int err;
62
63 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
64
65 /*
66 * Skip flushing if there was an error, or if nothing was written.
67 */
68 if (ret <= 0)
69 return ret;
70
71 /*
72 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
73 * journalling then we need to make sure that we force the transaction
74 * to disk to keep all metadata uptodate synchronously.
75 */
76 if (file->f_flags & O_SYNC) {
77 /*
78 * If we are non-data-journaled, then the dirty data has
79 * already been flushed to backing store by generic_osync_inode,
80 * and the inode has been flushed too if there have been any
81 * modifications other than mere timestamp updates.
82 *
83 * Open question --- do we care about flushing timestamps too
84 * if the inode is IS_SYNC?
85 */
86 if (!ext3_should_journal_data(inode))
87 return ret;
88
89 goto force_commit;
90 }
91
92 /*
93 * So we know that there has been no forced data flush. If the inode
94 * is marked IS_SYNC, we need to force one ourselves.
95 */
96 if (!IS_SYNC(inode))
97 return ret;
98
99 /*
100 * Open question #2 --- should we force data to disk here too? If we
101 * don't, the only impact is that data=writeback filesystems won't
102 * flush data to disk automatically on IS_SYNC, only metadata (but
103 * historically, that is what ext2 has done.)
104 */
105
106force_commit:
107 err = ext3_force_commit(inode->i_sb);
108 if (err)
109 return err;
110 return ret;
111}
112
113const struct file_operations ext3_file_operations = { 54const struct file_operations ext3_file_operations = {
114 .llseek = generic_file_llseek, 55 .llseek = generic_file_llseek,
115 .read = do_sync_read, 56 .read = do_sync_read,
116 .write = do_sync_write, 57 .write = do_sync_write,
117 .aio_read = generic_file_aio_read, 58 .aio_read = generic_file_aio_read,
118 .aio_write = ext3_file_write, 59 .aio_write = generic_file_aio_write,
119 .unlocked_ioctl = ext3_ioctl, 60 .unlocked_ioctl = ext3_ioctl,
120#ifdef CONFIG_COMPAT 61#ifdef CONFIG_COMPAT
121 .compat_ioctl = ext3_compat_ioctl, 62 .compat_ioctl = ext3_compat_ioctl,
@@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = {
137 .listxattr = ext3_listxattr, 78 .listxattr = ext3_listxattr,
138 .removexattr = generic_removexattr, 79 .removexattr = generic_removexattr,
139#endif 80#endif
140 .permission = ext3_permission, 81 .check_acl = ext3_check_acl,
141 .fiemap = ext3_fiemap, 82 .fiemap = ext3_fiemap,
142}; 83};
143 84
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index d33634119e17..451d166bbe93 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/time.h> 25#include <linux/time.h>
26#include <linux/blkdev.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27#include <linux/sched.h> 28#include <linux/sched.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
73 } 74 }
74 75
75 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 76 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
76 goto out; 77 goto flush;
77 78
78 /* 79 /*
79 * The VFS has written the file data. If the inode is unaltered 80 * The VFS has written the file data. If the inode is unaltered
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
85 .nr_to_write = 0, /* sys_fsync did this */ 86 .nr_to_write = 0, /* sys_fsync did this */
86 }; 87 };
87 ret = sync_inode(inode, &wbc); 88 ret = sync_inode(inode, &wbc);
89 goto out;
88 } 90 }
91flush:
92 /*
93 * In case we didn't commit a transaction, we have to flush
94 * disk caches manually so that data really is on persistent
95 * storage
96 */
97 if (test_opt(inode->i_sb, BARRIER))
98 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
89out: 99out:
90 return ret; 100 return ret;
91} 101}
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b49908a167ae..cd098a7b77fc 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
172 * so before we call here everything must be consistently dirtied against 172 * so before we call here everything must be consistently dirtied against
173 * this transaction. 173 * this transaction.
174 */ 174 */
175static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) 175static int truncate_restart_transaction(handle_t *handle, struct inode *inode)
176{ 176{
177 int ret;
178
177 jbd_debug(2, "restarting handle %p\n", handle); 179 jbd_debug(2, "restarting handle %p\n", handle);
178 return ext3_journal_restart(handle, blocks_for_truncate(inode)); 180 /*
181 * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle
182 * At this moment, get_block can be called only for blocks inside
183 * i_size since page cache has been already dropped and writes are
184 * blocked by i_mutex. So we can safely drop the truncate_mutex.
185 */
186 mutex_unlock(&EXT3_I(inode)->truncate_mutex);
187 ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
188 mutex_lock(&EXT3_I(inode)->truncate_mutex);
189 return ret;
179} 190}
180 191
181/* 192/*
@@ -2072,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2072 ext3_journal_dirty_metadata(handle, bh); 2083 ext3_journal_dirty_metadata(handle, bh);
2073 } 2084 }
2074 ext3_mark_inode_dirty(handle, inode); 2085 ext3_mark_inode_dirty(handle, inode);
2075 ext3_journal_test_restart(handle, inode); 2086 truncate_restart_transaction(handle, inode);
2076 if (bh) { 2087 if (bh) {
2077 BUFFER_TRACE(bh, "retaking write access"); 2088 BUFFER_TRACE(bh, "retaking write access");
2078 ext3_journal_get_write_access(handle, bh); 2089 ext3_journal_get_write_access(handle, bh);
@@ -2282,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
2282 return; 2293 return;
2283 if (try_to_extend_transaction(handle, inode)) { 2294 if (try_to_extend_transaction(handle, inode)) {
2284 ext3_mark_inode_dirty(handle, inode); 2295 ext3_mark_inode_dirty(handle, inode);
2285 ext3_journal_test_restart(handle, inode); 2296 truncate_restart_transaction(handle, inode);
2286 } 2297 }
2287 2298
2288 ext3_free_blocks(handle, inode, nr, 1); 2299 ext3_free_blocks(handle, inode, nr, 1);
@@ -2892,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle,
2892 struct buffer_head *bh = iloc->bh; 2903 struct buffer_head *bh = iloc->bh;
2893 int err = 0, rc, block; 2904 int err = 0, rc, block;
2894 2905
2906again:
2907 /* we can't allow multiple procs in here at once, it's a bit racy */
2908 lock_buffer(bh);
2909
2895 /* For fields not tracked in the in-memory inode, 2910 /* For fields not tracked in the in-memory inode,
2896 * initialise them to zero for new inodes. */ 2911 * initialise them to zero for new inodes. */
2897 if (ei->i_state & EXT3_STATE_NEW) 2912 if (ei->i_state & EXT3_STATE_NEW)
@@ -2951,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle,
2951 /* If this is the first large file 2966 /* If this is the first large file
2952 * created, add a flag to the superblock. 2967 * created, add a flag to the superblock.
2953 */ 2968 */
2969 unlock_buffer(bh);
2954 err = ext3_journal_get_write_access(handle, 2970 err = ext3_journal_get_write_access(handle,
2955 EXT3_SB(sb)->s_sbh); 2971 EXT3_SB(sb)->s_sbh);
2956 if (err) 2972 if (err)
2957 goto out_brelse; 2973 goto out_brelse;
2974
2958 ext3_update_dynamic_rev(sb); 2975 ext3_update_dynamic_rev(sb);
2959 EXT3_SET_RO_COMPAT_FEATURE(sb, 2976 EXT3_SET_RO_COMPAT_FEATURE(sb,
2960 EXT3_FEATURE_RO_COMPAT_LARGE_FILE); 2977 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2961 handle->h_sync = 1; 2978 handle->h_sync = 1;
2962 err = ext3_journal_dirty_metadata(handle, 2979 err = ext3_journal_dirty_metadata(handle,
2963 EXT3_SB(sb)->s_sbh); 2980 EXT3_SB(sb)->s_sbh);
2981 /* get our lock and start over */
2982 goto again;
2964 } 2983 }
2965 } 2984 }
2966 } 2985 }
@@ -2983,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle,
2983 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 3002 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2984 3003
2985 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); 3004 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
3005 unlock_buffer(bh);
2986 rc = ext3_journal_dirty_metadata(handle, bh); 3006 rc = ext3_journal_dirty_metadata(handle, bh);
2987 if (!err) 3007 if (!err)
2988 err = rc; 3008 err = rc;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 6ff7b9730234..aad6400c9b77 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = {
2445 .listxattr = ext3_listxattr, 2445 .listxattr = ext3_listxattr,
2446 .removexattr = generic_removexattr, 2446 .removexattr = generic_removexattr,
2447#endif 2447#endif
2448 .permission = ext3_permission, 2448 .check_acl = ext3_check_acl,
2449}; 2449};
2450 2450
2451const struct inode_operations ext3_special_inode_operations = { 2451const struct inode_operations ext3_special_inode_operations = {
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = {
2456 .listxattr = ext3_listxattr, 2456 .listxattr = ext3_listxattr,
2457 .removexattr = generic_removexattr, 2457 .removexattr = generic_removexattr,
2458#endif 2458#endif
2459 .permission = ext3_permission, 2459 .check_acl = ext3_check_acl,
2460}; 2460};
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 418b6f3b0ae8..d5c0ea2e8f2d 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT
37 37
38 To enable backwards compatibility so that systems that are 38 To enable backwards compatibility so that systems that are
39 still expecting to mount ext4 filesystems using ext4dev, 39 still expecting to mount ext4 filesystems using ext4dev,
40 chose Y here. This feature will go away by 2.6.31, so 40 choose Y here. This feature will go away by 2.6.31, so
41 please arrange to get your userspace programs fixed! 41 please arrange to get your userspace programs fixed!
42 42
43config EXT4_FS_XATTR 43config EXT4_FS_XATTR
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY
77 77
78 If you are not using a security module that requires using 78 If you are not using a security module that requires using
79 extended attributes for file security labels, say N. 79 extended attributes for file security labels, say N.
80
81config EXT4_DEBUG
82 bool "EXT4 debugging support"
83 depends on EXT4_FS
84 help
85 Enables run-time debugging support for the ext4 filesystem.
86
87 If you select Y here, then you will be able to turn on debugging
88 with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index f6d8967149ca..0df88b2a69b0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
236 return error; 236 return error;
237} 237}
238 238
239static int 239int
240ext4_check_acl(struct inode *inode, int mask) 240ext4_check_acl(struct inode *inode, int mask)
241{ 241{
242 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); 242 struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask)
252 return -EAGAIN; 252 return -EAGAIN;
253} 253}
254 254
255int
256ext4_permission(struct inode *inode, int mask)
257{
258 return generic_permission(inode, mask, ext4_check_acl);
259}
260
261/* 255/*
262 * Initialize the ACLs of a new inode. Called from ext4_new_inode. 256 * Initialize the ACLs of a new inode. Called from ext4_new_inode.
263 * 257 *
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 949789d2bba6..9d843d5deac4 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size)
54#ifdef CONFIG_EXT4_FS_POSIX_ACL 54#ifdef CONFIG_EXT4_FS_POSIX_ACL
55 55
56/* acl.c */ 56/* acl.c */
57extern int ext4_permission(struct inode *, int); 57extern int ext4_check_acl(struct inode *, int);
58extern int ext4_acl_chmod(struct inode *); 58extern int ext4_acl_chmod(struct inode *);
59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); 59extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
60 60
61#else /* CONFIG_EXT4_FS_POSIX_ACL */ 61#else /* CONFIG_EXT4_FS_POSIX_ACL */
62#include <linux/sched.h> 62#include <linux/sched.h>
63#define ext4_permission NULL 63#define ext4_check_acl NULL
64 64
65static inline int 65static inline int
66ext4_acl_chmod(struct inode *inode) 66ext4_acl_chmod(struct inode *inode)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e2126d70dff5..1d0418980f8d 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
478 * new bitmap information 478 * new bitmap information
479 */ 479 */
480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 480 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
481 ext4_mb_update_group_info(grp, blocks_freed); 481 grp->bb_free += blocks_freed;
482 up_write(&grp->alloc_sem); 482 up_write(&grp->alloc_sem);
483 483
484 /* We dirtied the bitmap block */ 484 /* We dirtied the bitmap block */
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9714db393efe..e227eea23f05 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t;
67 67
68 68
69/* prefer goal again. length */ 69/* prefer goal again. length */
70#define EXT4_MB_HINT_MERGE 1 70#define EXT4_MB_HINT_MERGE 0x0001
71/* blocks already reserved */ 71/* blocks already reserved */
72#define EXT4_MB_HINT_RESERVED 2 72#define EXT4_MB_HINT_RESERVED 0x0002
73/* metadata is being allocated */ 73/* metadata is being allocated */
74#define EXT4_MB_HINT_METADATA 4 74#define EXT4_MB_HINT_METADATA 0x0004
75/* first blocks in the file */ 75/* first blocks in the file */
76#define EXT4_MB_HINT_FIRST 8 76#define EXT4_MB_HINT_FIRST 0x0008
77/* search for the best chunk */ 77/* search for the best chunk */
78#define EXT4_MB_HINT_BEST 16 78#define EXT4_MB_HINT_BEST 0x0010
79/* data is being allocated */ 79/* data is being allocated */
80#define EXT4_MB_HINT_DATA 32 80#define EXT4_MB_HINT_DATA 0x0020
81/* don't preallocate (for tails) */ 81/* don't preallocate (for tails) */
82#define EXT4_MB_HINT_NOPREALLOC 64 82#define EXT4_MB_HINT_NOPREALLOC 0x0040
83/* allocate for locality group */ 83/* allocate for locality group */
84#define EXT4_MB_HINT_GROUP_ALLOC 128 84#define EXT4_MB_HINT_GROUP_ALLOC 0x0080
85/* allocate goal blocks or none */ 85/* allocate goal blocks or none */
86#define EXT4_MB_HINT_GOAL_ONLY 256 86#define EXT4_MB_HINT_GOAL_ONLY 0x0100
87/* goal is meaningful */ 87/* goal is meaningful */
88#define EXT4_MB_HINT_TRY_GOAL 512 88#define EXT4_MB_HINT_TRY_GOAL 0x0200
89/* blocks already pre-reserved by delayed allocation */ 89/* blocks already pre-reserved by delayed allocation */
90#define EXT4_MB_DELALLOC_RESERVED 1024 90#define EXT4_MB_DELALLOC_RESERVED 0x0400
91/* We are doing stream allocation */
92#define EXT4_MB_STREAM_ALLOC 0x0800
91 93
92 94
93struct ext4_allocation_request { 95struct ext4_allocation_request {
@@ -112,6 +114,21 @@ struct ext4_allocation_request {
112}; 114};
113 115
114/* 116/*
117 * For delayed allocation tracking
118 */
119struct mpage_da_data {
120 struct inode *inode;
121 sector_t b_blocknr; /* start block number of extent */
122 size_t b_size; /* size of extent */
123 unsigned long b_state; /* state of the extent */
124 unsigned long first_page, next_page; /* extent of pages */
125 struct writeback_control *wbc;
126 int io_done;
127 int pages_written;
128 int retval;
129};
130
131/*
115 * Special inodes numbers 132 * Special inodes numbers
116 */ 133 */
117#define EXT4_BAD_INO 1 /* Bad blocks inode */ 134#define EXT4_BAD_INO 1 /* Bad blocks inode */
@@ -251,7 +268,6 @@ struct flex_groups {
251#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ 268#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
252#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ 269#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */
253#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ 270#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
254#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */
255#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ 271#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
256 272
257#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ 273#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
289#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ 305#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
290#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ 306#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
291#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ 307#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
308#define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */
292 309
293/* Used to pass group descriptor data when online resize is done */ 310/* Used to pass group descriptor data when online resize is done */
294struct ext4_new_group_input { 311struct ext4_new_group_input {
@@ -386,6 +403,9 @@ struct ext4_mount_options {
386#endif 403#endif
387}; 404};
388 405
406/* Max physical block we can address w/o extents */
407#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
408
389/* 409/*
390 * Structure of an inode on the disk 410 * Structure of an inode on the disk
391 */ 411 */
@@ -456,7 +476,6 @@ struct move_extent {
456 __u64 len; /* block length to be moved */ 476 __u64 len; /* block length to be moved */
457 __u64 moved_len; /* moved block length */ 477 __u64 moved_len; /* moved block length */
458}; 478};
459#define MAX_DEFRAG_SIZE ((1UL<<31) - 1)
460 479
461#define EXT4_EPOCH_BITS 2 480#define EXT4_EPOCH_BITS 2
462#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) 481#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
@@ -694,7 +713,6 @@ struct ext4_inode_info {
694#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ 713#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
695#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ 714#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
696#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ 715#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
697#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
698#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ 716#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
699#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ 717#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
700#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 718#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
@@ -841,6 +859,7 @@ struct ext4_sb_info {
841 unsigned long s_gdb_count; /* Number of group descriptor blocks */ 859 unsigned long s_gdb_count; /* Number of group descriptor blocks */
842 unsigned long s_desc_per_block; /* Number of group descriptors per block */ 860 unsigned long s_desc_per_block; /* Number of group descriptors per block */
843 ext4_group_t s_groups_count; /* Number of groups in the fs */ 861 ext4_group_t s_groups_count; /* Number of groups in the fs */
862 ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
844 unsigned long s_overhead_last; /* Last calculated overhead */ 863 unsigned long s_overhead_last; /* Last calculated overhead */
845 unsigned long s_blocks_last; /* Last seen block count */ 864 unsigned long s_blocks_last; /* Last seen block count */
846 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 865 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
@@ -950,6 +969,7 @@ struct ext4_sb_info {
950 atomic_t s_mb_lost_chunks; 969 atomic_t s_mb_lost_chunks;
951 atomic_t s_mb_preallocated; 970 atomic_t s_mb_preallocated;
952 atomic_t s_mb_discarded; 971 atomic_t s_mb_discarded;
972 atomic_t s_lock_busy;
953 973
954 /* locality groups */ 974 /* locality groups */
955 struct ext4_locality_group *s_locality_groups; 975 struct ext4_locality_group *s_locality_groups;
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1340 ext4_fsblk_t, unsigned long, int, unsigned long *); 1360 ext4_fsblk_t, unsigned long, int, unsigned long *);
1341extern int ext4_mb_add_groupinfo(struct super_block *sb, 1361extern int ext4_mb_add_groupinfo(struct super_block *sb,
1342 ext4_group_t i, struct ext4_group_desc *desc); 1362 ext4_group_t i, struct ext4_group_desc *desc);
1343extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1344 ext4_grpblk_t add);
1345extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1363extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1346extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1364extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1347 ext4_group_t, int); 1365 ext4_group_t, int);
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
1367extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); 1385extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1368extern int ext4_can_truncate(struct inode *inode); 1386extern int ext4_can_truncate(struct inode *inode);
1369extern void ext4_truncate(struct inode *); 1387extern void ext4_truncate(struct inode *);
1388extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
1370extern void ext4_set_inode_flags(struct inode *); 1389extern void ext4_set_inode_flags(struct inode *);
1371extern void ext4_get_inode_flags(struct ext4_inode_info *); 1390extern void ext4_get_inode_flags(struct ext4_inode_info *);
1372extern int ext4_alloc_da_blocks(struct inode *inode); 1391extern int ext4_alloc_da_blocks(struct inode *inode);
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
1575struct ext4_group_info { 1594struct ext4_group_info {
1576 unsigned long bb_state; 1595 unsigned long bb_state;
1577 struct rb_root bb_free_root; 1596 struct rb_root bb_free_root;
1578 unsigned short bb_first_free; 1597 ext4_grpblk_t bb_first_free; /* first free block */
1579 unsigned short bb_free; 1598 ext4_grpblk_t bb_free; /* total free blocks */
1580 unsigned short bb_fragments; 1599 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1581 struct list_head bb_prealloc_list; 1600 struct list_head bb_prealloc_list;
1582#ifdef DOUBLE_CHECK 1601#ifdef DOUBLE_CHECK
1583 void *bb_bitmap; 1602 void *bb_bitmap;
1584#endif 1603#endif
1585 struct rw_semaphore alloc_sem; 1604 struct rw_semaphore alloc_sem;
1586 unsigned short bb_counters[]; 1605 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
1606 * regions, index is order.
1607 * bb_counters[3] = 5 means
1608 * 5 free 8-block regions. */
1587}; 1609};
1588 1610
1589#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 1611#define EXT4_GROUP_INFO_NEED_INIT_BIT 0
@@ -1591,15 +1613,42 @@ struct ext4_group_info {
1591#define EXT4_MB_GRP_NEED_INIT(grp) \ 1613#define EXT4_MB_GRP_NEED_INIT(grp) \
1592 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) 1614 (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
1593 1615
1616#define EXT4_MAX_CONTENTION 8
1617#define EXT4_CONTENTION_THRESHOLD 2
1618
1594static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, 1619static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
1595 ext4_group_t group) 1620 ext4_group_t group)
1596{ 1621{
1597 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); 1622 return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
1598} 1623}
1599 1624
1625/*
1626 * Returns true if the filesystem is busy enough that attempts to
1627 * access the block group locks has run into contention.
1628 */
1629static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
1630{
1631 return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
1632}
1633
1600static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 1634static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
1601{ 1635{
1602 spin_lock(ext4_group_lock_ptr(sb, group)); 1636 spinlock_t *lock = ext4_group_lock_ptr(sb, group);
1637 if (spin_trylock(lock))
1638 /*
1639 * We're able to grab the lock right away, so drop the
1640 * lock contention counter.
1641 */
1642 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
1643 else {
1644 /*
1645 * The lock is busy, so bump the contention counter,
1646 * and then wait on the spin lock.
1647 */
1648 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
1649 EXT4_MAX_CONTENTION);
1650 spin_lock(lock);
1651 }
1603} 1652}
1604 1653
1605static inline void ext4_unlock_group(struct super_block *sb, 1654static inline void ext4_unlock_group(struct super_block *sb,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 20a84105a10b..61652f1d15e6 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,8 +43,7 @@
43#define CHECK_BINSEARCH__ 43#define CHECK_BINSEARCH__
44 44
45/* 45/*
46 * If EXT_DEBUG is defined you can use the 'extdebug' mount option 46 * Turn on EXT_DEBUG to get lots of info about extents operations.
47 * to get lots of info about what's going on.
48 */ 47 */
49#define EXT_DEBUG__ 48#define EXT_DEBUG__
50#ifdef EXT_DEBUG 49#ifdef EXT_DEBUG
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
138#define EXT_BREAK 1 137#define EXT_BREAK 1
139#define EXT_REPEAT 2 138#define EXT_REPEAT 2
140 139
140/* Maximum logical block in a file; ext4_extent's ee_block is __le32 */
141#define EXT_MAX_BLOCK 0xffffffff 141#define EXT_MAX_BLOCK 0xffffffff
142 142
143/* 143/*
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index eb27fd0f2ee8..6a9409920dee 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle,
44 handle, err); 44 handle, err);
45 } 45 }
46 else 46 else
47 brelse(bh); 47 bforget(bh);
48 return err; 48 return err;
49} 49}
50 50
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle,
60 handle, err); 60 handle, err);
61 } 61 }
62 else 62 else
63 brelse(bh); 63 bforget(bh);
64 return err; 64 return err;
65} 65}
66 66
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
89 ext4_journal_abort_handle(where, __func__, bh, 89 ext4_journal_abort_handle(where, __func__, bh,
90 handle, err); 90 handle, err);
91 } else { 91 } else {
92 mark_buffer_dirty(bh); 92 if (inode && bh)
93 mark_buffer_dirty_inode(bh, inode);
94 else
95 mark_buffer_dirty(bh);
93 if (inode && inode_needs_sync(inode)) { 96 if (inode && inode_needs_sync(inode)) {
94 sync_dirty_buffer(bh); 97 sync_dirty_buffer(bh);
95 if (buffer_req(bh) && !buffer_uptodate(bh)) { 98 if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 73ebfb44ad75..7a3832577923 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); 93 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
94} 94}
95 95
96static int ext4_ext_journal_restart(handle_t *handle, int needed) 96static int ext4_ext_truncate_extend_restart(handle_t *handle,
97 struct inode *inode,
98 int needed)
97{ 99{
98 int err; 100 int err;
99 101
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed)
104 err = ext4_journal_extend(handle, needed); 106 err = ext4_journal_extend(handle, needed);
105 if (err <= 0) 107 if (err <= 0)
106 return err; 108 return err;
107 return ext4_journal_restart(handle, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /*
111 * We have dropped i_data_sem so someone might have cached again
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115
116 return err;
108} 117}
109 118
110/* 119/*
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
220 return newblock; 229 return newblock;
221} 230}
222 231
223static int ext4_ext_space_block(struct inode *inode) 232static inline int ext4_ext_space_block(struct inode *inode, int check)
224{ 233{
225 int size; 234 int size;
226 235
227 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 236 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
228 / sizeof(struct ext4_extent); 237 / sizeof(struct ext4_extent);
238 if (!check) {
229#ifdef AGGRESSIVE_TEST 239#ifdef AGGRESSIVE_TEST
230 if (size > 6) 240 if (size > 6)
231 size = 6; 241 size = 6;
232#endif 242#endif
243 }
233 return size; 244 return size;
234} 245}
235 246
236static int ext4_ext_space_block_idx(struct inode *inode) 247static inline int ext4_ext_space_block_idx(struct inode *inode, int check)
237{ 248{
238 int size; 249 int size;
239 250
240 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) 251 size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
241 / sizeof(struct ext4_extent_idx); 252 / sizeof(struct ext4_extent_idx);
253 if (!check) {
242#ifdef AGGRESSIVE_TEST 254#ifdef AGGRESSIVE_TEST
243 if (size > 5) 255 if (size > 5)
244 size = 5; 256 size = 5;
245#endif 257#endif
258 }
246 return size; 259 return size;
247} 260}
248 261
249static int ext4_ext_space_root(struct inode *inode) 262static inline int ext4_ext_space_root(struct inode *inode, int check)
250{ 263{
251 int size; 264 int size;
252 265
253 size = sizeof(EXT4_I(inode)->i_data); 266 size = sizeof(EXT4_I(inode)->i_data);
254 size -= sizeof(struct ext4_extent_header); 267 size -= sizeof(struct ext4_extent_header);
255 size /= sizeof(struct ext4_extent); 268 size /= sizeof(struct ext4_extent);
269 if (!check) {
256#ifdef AGGRESSIVE_TEST 270#ifdef AGGRESSIVE_TEST
257 if (size > 3) 271 if (size > 3)
258 size = 3; 272 size = 3;
259#endif 273#endif
274 }
260 return size; 275 return size;
261} 276}
262 277
263static int ext4_ext_space_root_idx(struct inode *inode) 278static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
264{ 279{
265 int size; 280 int size;
266 281
267 size = sizeof(EXT4_I(inode)->i_data); 282 size = sizeof(EXT4_I(inode)->i_data);
268 size -= sizeof(struct ext4_extent_header); 283 size -= sizeof(struct ext4_extent_header);
269 size /= sizeof(struct ext4_extent_idx); 284 size /= sizeof(struct ext4_extent_idx);
285 if (!check) {
270#ifdef AGGRESSIVE_TEST 286#ifdef AGGRESSIVE_TEST
271 if (size > 4) 287 if (size > 4)
272 size = 4; 288 size = 4;
273#endif 289#endif
290 }
274 return size; 291 return size;
275} 292}
276 293
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
284 int lcap, icap, rcap, leafs, idxs, num; 301 int lcap, icap, rcap, leafs, idxs, num;
285 int newextents = blocks; 302 int newextents = blocks;
286 303
287 rcap = ext4_ext_space_root_idx(inode); 304 rcap = ext4_ext_space_root_idx(inode, 0);
288 lcap = ext4_ext_space_block(inode); 305 lcap = ext4_ext_space_block(inode, 0);
289 icap = ext4_ext_space_block_idx(inode); 306 icap = ext4_ext_space_block_idx(inode, 0);
290 307
291 /* number of new leaf blocks needed */ 308 /* number of new leaf blocks needed */
292 num = leafs = (newextents + lcap - 1) / lcap; 309 num = leafs = (newextents + lcap - 1) / lcap;
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth)
311 328
312 if (depth == ext_depth(inode)) { 329 if (depth == ext_depth(inode)) {
313 if (depth == 0) 330 if (depth == 0)
314 max = ext4_ext_space_root(inode); 331 max = ext4_ext_space_root(inode, 1);
315 else 332 else
316 max = ext4_ext_space_root_idx(inode); 333 max = ext4_ext_space_root_idx(inode, 1);
317 } else { 334 } else {
318 if (depth == 0) 335 if (depth == 0)
319 max = ext4_ext_space_block(inode); 336 max = ext4_ext_space_block(inode, 1);
320 else 337 else
321 max = ext4_ext_space_block_idx(inode); 338 max = ext4_ext_space_block_idx(inode, 1);
322 } 339 }
323 340
324 return max; 341 return max;
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
437 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), 454 ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block),
438 idx_pblock(path->p_idx)); 455 idx_pblock(path->p_idx));
439 } else if (path->p_ext) { 456 } else if (path->p_ext) {
440 ext_debug(" %d:%d:%llu ", 457 ext_debug(" %d:[%d]%d:%llu ",
441 le32_to_cpu(path->p_ext->ee_block), 458 le32_to_cpu(path->p_ext->ee_block),
459 ext4_ext_is_uninitialized(path->p_ext),
442 ext4_ext_get_actual_len(path->p_ext), 460 ext4_ext_get_actual_len(path->p_ext),
443 ext_pblock(path->p_ext)); 461 ext_pblock(path->p_ext));
444 } else 462 } else
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
460 eh = path[depth].p_hdr; 478 eh = path[depth].p_hdr;
461 ex = EXT_FIRST_EXTENT(eh); 479 ex = EXT_FIRST_EXTENT(eh);
462 480
481 ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino);
482
463 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { 483 for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
464 ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), 484 ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
485 ext4_ext_is_uninitialized(ex),
465 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 486 ext4_ext_get_actual_len(ex), ext_pblock(ex));
466 } 487 }
467 ext_debug("\n"); 488 ext_debug("\n");
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode,
580 } 601 }
581 602
582 path->p_ext = l - 1; 603 path->p_ext = l - 1;
583 ext_debug(" -> %d:%llu:%d ", 604 ext_debug(" -> %d:%llu:[%d]%d ",
584 le32_to_cpu(path->p_ext->ee_block), 605 le32_to_cpu(path->p_ext->ee_block),
585 ext_pblock(path->p_ext), 606 ext_pblock(path->p_ext),
607 ext4_ext_is_uninitialized(path->p_ext),
586 ext4_ext_get_actual_len(path->p_ext)); 608 ext4_ext_get_actual_len(path->p_ext));
587 609
588#ifdef CHECK_BINSEARCH 610#ifdef CHECK_BINSEARCH
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
612 eh->eh_depth = 0; 634 eh->eh_depth = 0;
613 eh->eh_entries = 0; 635 eh->eh_entries = 0;
614 eh->eh_magic = EXT4_EXT_MAGIC; 636 eh->eh_magic = EXT4_EXT_MAGIC;
615 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); 637 eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
616 ext4_mark_inode_dirty(handle, inode); 638 ext4_mark_inode_dirty(handle, inode);
617 ext4_ext_invalidate_cache(inode); 639 ext4_ext_invalidate_cache(inode);
618 return 0; 640 return 0;
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
837 859
838 neh = ext_block_hdr(bh); 860 neh = ext_block_hdr(bh);
839 neh->eh_entries = 0; 861 neh->eh_entries = 0;
840 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 862 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
841 neh->eh_magic = EXT4_EXT_MAGIC; 863 neh->eh_magic = EXT4_EXT_MAGIC;
842 neh->eh_depth = 0; 864 neh->eh_depth = 0;
843 ex = EXT_FIRST_EXTENT(neh); 865 ex = EXT_FIRST_EXTENT(neh);
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
850 path[depth].p_ext++; 872 path[depth].p_ext++;
851 while (path[depth].p_ext <= 873 while (path[depth].p_ext <=
852 EXT_MAX_EXTENT(path[depth].p_hdr)) { 874 EXT_MAX_EXTENT(path[depth].p_hdr)) {
853 ext_debug("move %d:%llu:%d in new leaf %llu\n", 875 ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
854 le32_to_cpu(path[depth].p_ext->ee_block), 876 le32_to_cpu(path[depth].p_ext->ee_block),
855 ext_pblock(path[depth].p_ext), 877 ext_pblock(path[depth].p_ext),
878 ext4_ext_is_uninitialized(path[depth].p_ext),
856 ext4_ext_get_actual_len(path[depth].p_ext), 879 ext4_ext_get_actual_len(path[depth].p_ext),
857 newblock); 880 newblock);
858 /*memmove(ex++, path[depth].p_ext++, 881 /*memmove(ex++, path[depth].p_ext++,
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
912 neh = ext_block_hdr(bh); 935 neh = ext_block_hdr(bh);
913 neh->eh_entries = cpu_to_le16(1); 936 neh->eh_entries = cpu_to_le16(1);
914 neh->eh_magic = EXT4_EXT_MAGIC; 937 neh->eh_magic = EXT4_EXT_MAGIC;
915 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 938 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
916 neh->eh_depth = cpu_to_le16(depth - i); 939 neh->eh_depth = cpu_to_le16(depth - i);
917 fidx = EXT_FIRST_INDEX(neh); 940 fidx = EXT_FIRST_INDEX(neh);
918 fidx->ei_block = border; 941 fidx->ei_block = border;
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1037 /* old root could have indexes or leaves 1060 /* old root could have indexes or leaves
1038 * so calculate e_max right way */ 1061 * so calculate e_max right way */
1039 if (ext_depth(inode)) 1062 if (ext_depth(inode))
1040 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); 1063 neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0));
1041 else 1064 else
1042 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); 1065 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
1043 neh->eh_magic = EXT4_EXT_MAGIC; 1066 neh->eh_magic = EXT4_EXT_MAGIC;
1044 set_buffer_uptodate(bh); 1067 set_buffer_uptodate(bh);
1045 unlock_buffer(bh); 1068 unlock_buffer(bh);
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1054 goto out; 1077 goto out;
1055 1078
1056 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; 1079 curp->p_hdr->eh_magic = EXT4_EXT_MAGIC;
1057 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); 1080 curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0));
1058 curp->p_hdr->eh_entries = cpu_to_le16(1); 1081 curp->p_hdr->eh_entries = cpu_to_le16(1);
1059 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); 1082 curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr);
1060 1083
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
1580 1603
1581 /* try to insert block into found extent and return */ 1604 /* try to insert block into found extent and return */
1582 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { 1605 if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
1583 ext_debug("append %d block to %d:%d (from %llu)\n", 1606 ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
1607 ext4_ext_is_uninitialized(newext),
1584 ext4_ext_get_actual_len(newext), 1608 ext4_ext_get_actual_len(newext),
1585 le32_to_cpu(ex->ee_block), 1609 le32_to_cpu(ex->ee_block),
1610 ext4_ext_is_uninitialized(ex),
1586 ext4_ext_get_actual_len(ex), ext_pblock(ex)); 1611 ext4_ext_get_actual_len(ex), ext_pblock(ex));
1587 err = ext4_ext_get_access(handle, inode, path + depth); 1612 err = ext4_ext_get_access(handle, inode, path + depth);
1588 if (err) 1613 if (err)
@@ -1651,9 +1676,10 @@ has_space:
1651 1676
1652 if (!nearex) { 1677 if (!nearex) {
1653 /* there is no extent in this leaf, create first one */ 1678 /* there is no extent in this leaf, create first one */
1654 ext_debug("first extent in the leaf: %d:%llu:%d\n", 1679 ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n",
1655 le32_to_cpu(newext->ee_block), 1680 le32_to_cpu(newext->ee_block),
1656 ext_pblock(newext), 1681 ext_pblock(newext),
1682 ext4_ext_is_uninitialized(newext),
1657 ext4_ext_get_actual_len(newext)); 1683 ext4_ext_get_actual_len(newext));
1658 path[depth].p_ext = EXT_FIRST_EXTENT(eh); 1684 path[depth].p_ext = EXT_FIRST_EXTENT(eh);
1659 } else if (le32_to_cpu(newext->ee_block) 1685 } else if (le32_to_cpu(newext->ee_block)
@@ -1663,10 +1689,11 @@ has_space:
1663 len = EXT_MAX_EXTENT(eh) - nearex; 1689 len = EXT_MAX_EXTENT(eh) - nearex;
1664 len = (len - 1) * sizeof(struct ext4_extent); 1690 len = (len - 1) * sizeof(struct ext4_extent);
1665 len = len < 0 ? 0 : len; 1691 len = len < 0 ? 0 : len;
1666 ext_debug("insert %d:%llu:%d after: nearest 0x%p, " 1692 ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, "
1667 "move %d from 0x%p to 0x%p\n", 1693 "move %d from 0x%p to 0x%p\n",
1668 le32_to_cpu(newext->ee_block), 1694 le32_to_cpu(newext->ee_block),
1669 ext_pblock(newext), 1695 ext_pblock(newext),
1696 ext4_ext_is_uninitialized(newext),
1670 ext4_ext_get_actual_len(newext), 1697 ext4_ext_get_actual_len(newext),
1671 nearex, len, nearex + 1, nearex + 2); 1698 nearex, len, nearex + 1, nearex + 2);
1672 memmove(nearex + 2, nearex + 1, len); 1699 memmove(nearex + 2, nearex + 1, len);
@@ -1676,10 +1703,11 @@ has_space:
1676 BUG_ON(newext->ee_block == nearex->ee_block); 1703 BUG_ON(newext->ee_block == nearex->ee_block);
1677 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); 1704 len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent);
1678 len = len < 0 ? 0 : len; 1705 len = len < 0 ? 0 : len;
1679 ext_debug("insert %d:%llu:%d before: nearest 0x%p, " 1706 ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, "
1680 "move %d from 0x%p to 0x%p\n", 1707 "move %d from 0x%p to 0x%p\n",
1681 le32_to_cpu(newext->ee_block), 1708 le32_to_cpu(newext->ee_block),
1682 ext_pblock(newext), 1709 ext_pblock(newext),
1710 ext4_ext_is_uninitialized(newext),
1683 ext4_ext_get_actual_len(newext), 1711 ext4_ext_get_actual_len(newext),
1684 nearex, len, nearex + 1, nearex + 2); 1712 nearex, len, nearex + 1, nearex + 2);
1685 memmove(nearex + 1, nearex, len); 1713 memmove(nearex + 1, nearex, len);
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2094 else 2122 else
2095 uninitialized = 0; 2123 uninitialized = 0;
2096 2124
2097 ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); 2125 ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
2126 uninitialized, ex_ee_len);
2098 path[depth].p_ext = ex; 2127 path[depth].p_ext = ex;
2099 2128
2100 a = ex_ee_block > start ? ex_ee_block : start; 2129 a = ex_ee_block > start ? ex_ee_block : start;
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2138 } 2167 }
2139 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2168 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
2140 2169
2141 err = ext4_ext_journal_restart(handle, credits); 2170 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2142 if (err) 2171 if (err)
2143 goto out; 2172 goto out;
2144 2173
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2327 if (err == 0) { 2356 if (err == 0) {
2328 ext_inode_hdr(inode)->eh_depth = 0; 2357 ext_inode_hdr(inode)->eh_depth = 0;
2329 ext_inode_hdr(inode)->eh_max = 2358 ext_inode_hdr(inode)->eh_max =
2330 cpu_to_le16(ext4_ext_space_root(inode)); 2359 cpu_to_le16(ext4_ext_space_root(inode, 0));
2331 err = ext4_ext_dirty(handle, inode, path); 2360 err = ext4_ext_dirty(handle, inode, path);
2332 } 2361 }
2333 } 2362 }
@@ -2743,6 +2772,7 @@ insert:
2743 } else if (err) 2772 } else if (err)
2744 goto fix_extent_len; 2773 goto fix_extent_len;
2745out: 2774out:
2775 ext4_ext_show_leaf(inode, path);
2746 return err ? err : allocated; 2776 return err ? err : allocated;
2747 2777
2748fix_extent_len: 2778fix_extent_len:
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2786 struct ext4_allocation_request ar; 2816 struct ext4_allocation_request ar;
2787 2817
2788 __clear_bit(BH_New, &bh_result->b_state); 2818 __clear_bit(BH_New, &bh_result->b_state);
2789 ext_debug("blocks %u/%u requested for inode %u\n", 2819 ext_debug("blocks %u/%u requested for inode %lu\n",
2790 iblock, max_blocks, inode->i_ino); 2820 iblock, max_blocks, inode->i_ino);
2791 2821
2792 /* check in cache */ 2822 /* check in cache */
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2849 newblock = iblock - ee_block + ee_start; 2879 newblock = iblock - ee_block + ee_start;
2850 /* number of remaining blocks in the extent */ 2880 /* number of remaining blocks in the extent */
2851 allocated = ee_len - (iblock - ee_block); 2881 allocated = ee_len - (iblock - ee_block);
2852 ext_debug("%u fit into %lu:%d -> %llu\n", iblock, 2882 ext_debug("%u fit into %u:%d -> %llu\n", iblock,
2853 ee_block, ee_len, newblock); 2883 ee_block, ee_len, newblock);
2854 2884
2855 /* Do not put uninitialized extent in the cache */ 2885 /* Do not put uninitialized extent in the cache */
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
2950 newblock = ext4_mb_new_blocks(handle, &ar, &err); 2980 newblock = ext4_mb_new_blocks(handle, &ar, &err);
2951 if (!newblock) 2981 if (!newblock)
2952 goto out2; 2982 goto out2;
2953 ext_debug("allocate new block: goal %llu, found %llu/%lu\n", 2983 ext_debug("allocate new block: goal %llu, found %llu/%u\n",
2954 ar.goal, newblock, allocated); 2984 ar.goal, newblock, allocated);
2955 2985
2956 /* try to insert new extent into found leaf and return */ 2986 /* try to insert new extent into found leaf and return */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3f1873fef1c6..5ca3eca70a1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -58,10 +58,7 @@ static ssize_t
58ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 58ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
59 unsigned long nr_segs, loff_t pos) 59 unsigned long nr_segs, loff_t pos)
60{ 60{
61 struct file *file = iocb->ki_filp; 61 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
62 struct inode *inode = file->f_path.dentry->d_inode;
63 ssize_t ret;
64 int err;
65 62
66 /* 63 /*
67 * If we have encountered a bitmap-format file, the size limit 64 * If we have encountered a bitmap-format file, the size limit
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
81 } 78 }
82 } 79 }
83 80
84 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 81 return generic_file_aio_write(iocb, iov, nr_segs, pos);
85 /*
86 * Skip flushing if there was an error, or if nothing was written.
87 */
88 if (ret <= 0)
89 return ret;
90
91 /*
92 * If the inode is IS_SYNC, or is O_SYNC and we are doing data
93 * journalling then we need to make sure that we force the transaction
94 * to disk to keep all metadata uptodate synchronously.
95 */
96 if (file->f_flags & O_SYNC) {
97 /*
98 * If we are non-data-journaled, then the dirty data has
99 * already been flushed to backing store by generic_osync_inode,
100 * and the inode has been flushed too if there have been any
101 * modifications other than mere timestamp updates.
102 *
103 * Open question --- do we care about flushing timestamps too
104 * if the inode is IS_SYNC?
105 */
106 if (!ext4_should_journal_data(inode))
107 return ret;
108
109 goto force_commit;
110 }
111
112 /*
113 * So we know that there has been no forced data flush. If the inode
114 * is marked IS_SYNC, we need to force one ourselves.
115 */
116 if (!IS_SYNC(inode))
117 return ret;
118
119 /*
120 * Open question #2 --- should we force data to disk here too? If we
121 * don't, the only impact is that data=writeback filesystems won't
122 * flush data to disk automatically on IS_SYNC, only metadata (but
123 * historically, that is what ext2 has done.)
124 */
125
126force_commit:
127 err = ext4_force_commit(inode->i_sb);
128 if (err)
129 return err;
130 return ret;
131} 82}
132 83
133static struct vm_operations_struct ext4_file_vm_ops = { 84static struct vm_operations_struct ext4_file_vm_ops = {
@@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = {
207 .listxattr = ext4_listxattr, 158 .listxattr = ext4_listxattr,
208 .removexattr = generic_removexattr, 159 .removexattr = generic_removexattr,
209#endif 160#endif
210 .permission = ext4_permission, 161 .check_acl = ext4_check_acl,
211 .fallocate = ext4_fallocate, 162 .fallocate = ext4_fallocate,
212 .fiemap = ext4_fiemap, 163 .fiemap = ext4_fiemap,
213}; 164};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 83cf6415f599..07475740b512 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
50{ 50{
51 struct inode *inode = dentry->d_inode; 51 struct inode *inode = dentry->d_inode;
52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 52 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
53 int ret = 0; 53 int err, ret = 0;
54 54
55 J_ASSERT(ext4_journal_current_handle() == NULL); 55 J_ASSERT(ext4_journal_current_handle() == NULL);
56 56
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 goto out; 79 goto out;
80 } 80 }
81 81
82 if (!journal)
83 ret = sync_mapping_buffers(inode->i_mapping);
84
82 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 85 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
83 goto out; 86 goto out;
84 87
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
91 .sync_mode = WB_SYNC_ALL, 94 .sync_mode = WB_SYNC_ALL,
92 .nr_to_write = 0, /* sys_fsync did this */ 95 .nr_to_write = 0, /* sys_fsync did this */
93 }; 96 };
94 ret = sync_inode(inode, &wbc); 97 err = sync_inode(inode, &wbc);
95 if (journal && (journal->j_flags & JBD2_BARRIER)) 98 if (ret == 0)
96 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 99 ret = err;
97 } 100 }
98out: 101out:
102 if (journal && (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
99 return ret; 104 return ret;
100} 105}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 29e6dc7299b8..f3624ead4f6c 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
1189 1189
1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); 1190 x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8);
1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", 1191 printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n",
1192 i, ext4_free_inodes_count(sb, gdp), x); 1192 (unsigned long) i, ext4_free_inodes_count(sb, gdp), x);
1193 bitmap_count += x; 1193 bitmap_count += x;
1194 } 1194 }
1195 brelse(bitmap_bh); 1195 brelse(bitmap_bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f9c642b22efa..4abd683b963d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
192 * so before we call here everything must be consistently dirtied against 192 * so before we call here everything must be consistently dirtied against
193 * this transaction. 193 * this transaction.
194 */ 194 */
195static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) 195 int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
196 int nblocks)
196{ 197{
198 int ret;
199
200 /*
201 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
202 * moment, get_block can be called only for blocks inside i_size since
203 * page cache has been already dropped and writes are blocked by
204 * i_mutex. So we can safely drop the i_data_sem here.
205 */
197 BUG_ON(EXT4_JOURNAL(inode) == NULL); 206 BUG_ON(EXT4_JOURNAL(inode) == NULL);
198 jbd_debug(2, "restarting handle %p\n", handle); 207 jbd_debug(2, "restarting handle %p\n", handle);
199 return ext4_journal_restart(handle, blocks_for_truncate(inode)); 208 up_write(&EXT4_I(inode)->i_data_sem);
209 ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
210 down_write(&EXT4_I(inode)->i_data_sem);
211
212 return ret;
200} 213}
201 214
202/* 215/*
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode,
341 int n = 0; 354 int n = 0;
342 int final = 0; 355 int final = 0;
343 356
344 if (i_block < 0) { 357 if (i_block < direct_blocks) {
345 ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
346 } else if (i_block < direct_blocks) {
347 offsets[n++] = i_block; 358 offsets[n++] = i_block;
348 final = direct_blocks; 359 final = direct_blocks;
349 } else if ((i_block -= direct_blocks) < indirect_blocks) { 360 } else if ((i_block -= direct_blocks) < indirect_blocks) {
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
551 * 562 *
552 * Normally this function find the preferred place for block allocation, 563 * Normally this function find the preferred place for block allocation,
553 * returns it. 564 * returns it.
565 * Because this is only used for non-extent files, we limit the block nr
566 * to 32 bits.
554 */ 567 */
555static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 568static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
556 Indirect *partial) 569 Indirect *partial)
557{ 570{
571 ext4_fsblk_t goal;
572
558 /* 573 /*
559 * XXX need to get goal block from mballoc's data structures 574 * XXX need to get goal block from mballoc's data structures
560 */ 575 */
561 576
562 return ext4_find_near(inode, partial); 577 goal = ext4_find_near(inode, partial);
578 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
579 return goal;
563} 580}
564 581
565/** 582/**
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
640 if (*err) 657 if (*err)
641 goto failed_out; 658 goto failed_out;
642 659
660 BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS);
661
643 target -= count; 662 target -= count;
644 /* allocate blocks for indirect blocks */ 663 /* allocate blocks for indirect blocks */
645 while (index < indirect_blks && count) { 664 while (index < indirect_blks && count) {
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
674 ar.flags = EXT4_MB_HINT_DATA; 693 ar.flags = EXT4_MB_HINT_DATA;
675 694
676 current_block = ext4_mb_new_blocks(handle, &ar, err); 695 current_block = ext4_mb_new_blocks(handle, &ar, err);
696 BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS);
677 697
678 if (*err && (target == blks)) { 698 if (*err && (target == blks)) {
679 /* 699 /*
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
762 BUFFER_TRACE(bh, "call get_create_access"); 782 BUFFER_TRACE(bh, "call get_create_access");
763 err = ext4_journal_get_create_access(handle, bh); 783 err = ext4_journal_get_create_access(handle, bh);
764 if (err) { 784 if (err) {
785 /* Don't brelse(bh) here; it's done in
786 * ext4_journal_forget() below */
765 unlock_buffer(bh); 787 unlock_buffer(bh);
766 brelse(bh);
767 goto failed; 788 goto failed;
768 } 789 }
769 790
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
1109 ext4_discard_preallocations(inode); 1130 ext4_discard_preallocations(inode);
1110} 1131}
1111 1132
1112static int check_block_validity(struct inode *inode, sector_t logical, 1133static int check_block_validity(struct inode *inode, const char *msg,
1113 sector_t phys, int len) 1134 sector_t logical, sector_t phys, int len)
1114{ 1135{
1115 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1136 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
1116 ext4_error(inode->i_sb, "check_block_validity", 1137 ext4_error(inode->i_sb, msg,
1117 "inode #%lu logical block %llu mapped to %llu " 1138 "inode #%lu logical block %llu mapped to %llu "
1118 "(size %d)", inode->i_ino, 1139 "(size %d)", inode->i_ino,
1119 (unsigned long long) logical, 1140 (unsigned long long) logical,
1120 (unsigned long long) phys, len); 1141 (unsigned long long) phys, len);
1121 WARN_ON(1);
1122 return -EIO; 1142 return -EIO;
1123 } 1143 }
1124 return 0; 1144 return 0;
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1170 up_read((&EXT4_I(inode)->i_data_sem)); 1190 up_read((&EXT4_I(inode)->i_data_sem));
1171 1191
1172 if (retval > 0 && buffer_mapped(bh)) { 1192 if (retval > 0 && buffer_mapped(bh)) {
1173 int ret = check_block_validity(inode, block, 1193 int ret = check_block_validity(inode, "file system corruption",
1174 bh->b_blocknr, retval); 1194 block, bh->b_blocknr, retval);
1175 if (ret != 0) 1195 if (ret != 0)
1176 return ret; 1196 return ret;
1177 } 1197 }
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1235 * i_data's format changing. Force the migrate 1255 * i_data's format changing. Force the migrate
1236 * to fail by clearing migrate flags 1256 * to fail by clearing migrate flags
1237 */ 1257 */
1238 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 1258 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
1239 ~EXT4_EXT_MIGRATE;
1240 } 1259 }
1241 } 1260 }
1242 1261
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1252 1271
1253 up_write((&EXT4_I(inode)->i_data_sem)); 1272 up_write((&EXT4_I(inode)->i_data_sem));
1254 if (retval > 0 && buffer_mapped(bh)) { 1273 if (retval > 0 && buffer_mapped(bh)) {
1255 int ret = check_block_validity(inode, block, 1274 int ret = check_block_validity(inode, "file system "
1256 bh->b_blocknr, retval); 1275 "corruption after allocation",
1276 block, bh->b_blocknr, retval);
1257 if (ret != 0) 1277 if (ret != 0)
1258 return ret; 1278 return ret;
1259 } 1279 }
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page,
1863 * Delayed allocation stuff 1883 * Delayed allocation stuff
1864 */ 1884 */
1865 1885
1866struct mpage_da_data {
1867 struct inode *inode;
1868 sector_t b_blocknr; /* start block number of extent */
1869 size_t b_size; /* size of extent */
1870 unsigned long b_state; /* state of the extent */
1871 unsigned long first_page, next_page; /* extent of pages */
1872 struct writeback_control *wbc;
1873 int io_done;
1874 int pages_written;
1875 int retval;
1876};
1877
1878/* 1886/*
1879 * mpage_da_submit_io - walks through extent of pages and try to write 1887 * mpage_da_submit_io - walks through extent of pages and try to write
1880 * them with writepage() call back 1888 * them with writepage() call back
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping,
2737 long pages_skipped; 2745 long pages_skipped;
2738 int range_cyclic, cycled = 1, io_done = 0; 2746 int range_cyclic, cycled = 1, io_done = 0;
2739 int needed_blocks, ret = 0, nr_to_writebump = 0; 2747 int needed_blocks, ret = 0, nr_to_writebump = 0;
2748 loff_t range_start = wbc->range_start;
2740 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2749 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2741 2750
2742 trace_ext4_da_writepages(inode, wbc); 2751 trace_ext4_da_writepages(inode, wbc);
@@ -2850,6 +2859,7 @@ retry:
2850 mpd.io_done = 1; 2859 mpd.io_done = 1;
2851 ret = MPAGE_DA_EXTENT_TAIL; 2860 ret = MPAGE_DA_EXTENT_TAIL;
2852 } 2861 }
2862 trace_ext4_da_write_pages(inode, &mpd);
2853 wbc->nr_to_write -= mpd.pages_written; 2863 wbc->nr_to_write -= mpd.pages_written;
2854 2864
2855 ext4_journal_stop(handle); 2865 ext4_journal_stop(handle);
@@ -2905,6 +2915,7 @@ out_writepages:
2905 if (!no_nrwrite_index_update) 2915 if (!no_nrwrite_index_update)
2906 wbc->no_nrwrite_index_update = 0; 2916 wbc->no_nrwrite_index_update = 0;
2907 wbc->nr_to_write -= nr_to_writebump; 2917 wbc->nr_to_write -= nr_to_writebump;
2918 wbc->range_start = range_start;
2908 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 2919 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2909 return ret; 2920 return ret;
2910} 2921}
@@ -3117,6 +3128,8 @@ out:
3117 */ 3128 */
3118int ext4_alloc_da_blocks(struct inode *inode) 3129int ext4_alloc_da_blocks(struct inode *inode)
3119{ 3130{
3131 trace_ext4_alloc_da_blocks(inode);
3132
3120 if (!EXT4_I(inode)->i_reserved_data_blocks && 3133 if (!EXT4_I(inode)->i_reserved_data_blocks &&
3121 !EXT4_I(inode)->i_reserved_meta_blocks) 3134 !EXT4_I(inode)->i_reserved_meta_blocks)
3122 return 0; 3135 return 0;
@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
3659 ext4_handle_dirty_metadata(handle, inode, bh); 3672 ext4_handle_dirty_metadata(handle, inode, bh);
3660 } 3673 }
3661 ext4_mark_inode_dirty(handle, inode); 3674 ext4_mark_inode_dirty(handle, inode);
3662 ext4_journal_test_restart(handle, inode); 3675 ext4_truncate_restart_trans(handle, inode,
3676 blocks_for_truncate(inode));
3663 if (bh) { 3677 if (bh) {
3664 BUFFER_TRACE(bh, "retaking write access"); 3678 BUFFER_TRACE(bh, "retaking write access");
3665 ext4_journal_get_write_access(handle, bh); 3679 ext4_journal_get_write_access(handle, bh);
@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
3870 return; 3884 return;
3871 if (try_to_extend_transaction(handle, inode)) { 3885 if (try_to_extend_transaction(handle, inode)) {
3872 ext4_mark_inode_dirty(handle, inode); 3886 ext4_mark_inode_dirty(handle, inode);
3873 ext4_journal_test_restart(handle, inode); 3887 ext4_truncate_restart_trans(handle, inode,
3888 blocks_for_truncate(inode));
3874 } 3889 }
3875 3890
3876 ext4_free_blocks(handle, inode, nr, 1, 1); 3891 ext4_free_blocks(handle, inode, nr, 1, 1);
@@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode)
3958 if (!ext4_can_truncate(inode)) 3973 if (!ext4_can_truncate(inode))
3959 return; 3974 return;
3960 3975
3961 if (ei->i_disksize && inode->i_size == 0 && 3976 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3962 !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
3963 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; 3977 ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
3964 3978
3965 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 3979 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle,
4533 */ 4547 */
4534static int ext4_do_update_inode(handle_t *handle, 4548static int ext4_do_update_inode(handle_t *handle,
4535 struct inode *inode, 4549 struct inode *inode,
4536 struct ext4_iloc *iloc) 4550 struct ext4_iloc *iloc,
4551 int do_sync)
4537{ 4552{
4538 struct ext4_inode *raw_inode = ext4_raw_inode(iloc); 4553 struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
4539 struct ext4_inode_info *ei = EXT4_I(inode); 4554 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle,
4581 if (ext4_inode_blocks_set(handle, raw_inode, ei)) 4596 if (ext4_inode_blocks_set(handle, raw_inode, ei))
4582 goto out_brelse; 4597 goto out_brelse;
4583 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); 4598 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
4584 /* clear the migrate flag in the raw_inode */ 4599 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
4585 raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE);
4586 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 4600 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
4587 cpu_to_le32(EXT4_OS_HURD)) 4601 cpu_to_le32(EXT4_OS_HURD))
4588 raw_inode->i_file_acl_high = 4602 raw_inode->i_file_acl_high =
@@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle,
4635 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); 4649 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
4636 } 4650 }
4637 4651
4638 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4652 /*
4639 rc = ext4_handle_dirty_metadata(handle, inode, bh); 4653 * If we're not using a journal and we were called from
4640 if (!err) 4654 * ext4_write_inode() to sync the inode (making do_sync true),
4641 err = rc; 4655 * we can just use sync_dirty_buffer() directly to do our dirty
4656 * work. Testing s_journal here is a bit redundant but it's
4657 * worth it to avoid potential future trouble.
4658 */
4659 if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) {
4660 BUFFER_TRACE(bh, "call sync_dirty_buffer");
4661 sync_dirty_buffer(bh);
4662 } else {
4663 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4664 rc = ext4_handle_dirty_metadata(handle, inode, bh);
4665 if (!err)
4666 err = rc;
4667 }
4642 ei->i_state &= ~EXT4_STATE_NEW; 4668 ei->i_state &= ~EXT4_STATE_NEW;
4643 4669
4644out_brelse: 4670out_brelse:
@@ -4684,19 +4710,32 @@ out_brelse:
4684 */ 4710 */
4685int ext4_write_inode(struct inode *inode, int wait) 4711int ext4_write_inode(struct inode *inode, int wait)
4686{ 4712{
4713 int err;
4714
4687 if (current->flags & PF_MEMALLOC) 4715 if (current->flags & PF_MEMALLOC)
4688 return 0; 4716 return 0;
4689 4717
4690 if (ext4_journal_current_handle()) { 4718 if (EXT4_SB(inode->i_sb)->s_journal) {
4691 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); 4719 if (ext4_journal_current_handle()) {
4692 dump_stack(); 4720 jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n");
4693 return -EIO; 4721 dump_stack();
4694 } 4722 return -EIO;
4723 }
4695 4724
4696 if (!wait) 4725 if (!wait)
4697 return 0; 4726 return 0;
4727
4728 err = ext4_force_commit(inode->i_sb);
4729 } else {
4730 struct ext4_iloc iloc;
4698 4731
4699 return ext4_force_commit(inode->i_sb); 4732 err = ext4_get_inode_loc(inode, &iloc);
4733 if (err)
4734 return err;
4735 err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE,
4736 inode, &iloc, wait);
4737 }
4738 return err;
4700} 4739}
4701 4740
4702/* 4741/*
@@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle,
4990 get_bh(iloc->bh); 5029 get_bh(iloc->bh);
4991 5030
4992 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ 5031 /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
4993 err = ext4_do_update_inode(handle, inode, iloc); 5032 err = ext4_do_update_inode(handle, inode, iloc, 0);
4994 put_bh(iloc->bh); 5033 put_bh(iloc->bh);
4995 return err; 5034 return err;
4996} 5035}
@@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5281 else 5320 else
5282 len = PAGE_CACHE_SIZE; 5321 len = PAGE_CACHE_SIZE;
5283 5322
5323 lock_page(page);
5324 /*
5325 * return if we have all the buffers mapped. This avoid
5326 * the need to call write_begin/write_end which does a
5327 * journal_start/journal_stop which can block and take
5328 * long time
5329 */
5284 if (page_has_buffers(page)) { 5330 if (page_has_buffers(page)) {
5285 /* return if we have all the buffers mapped */
5286 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, 5331 if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
5287 ext4_bh_unmapped)) 5332 ext4_bh_unmapped)) {
5333 unlock_page(page);
5288 goto out_unlock; 5334 goto out_unlock;
5335 }
5289 } 5336 }
5337 unlock_page(page);
5290 /* 5338 /*
5291 * OK, we need to fill the hole... Do write_begin write_end 5339 * OK, we need to fill the hole... Do write_begin write_end
5292 * to do block allocation/reservation.We are not holding 5340 * to do block allocation/reservation.We are not holding
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7050a9cd04a4..c1cdf613e725 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -243,10 +243,9 @@ setversion_out:
243 me.donor_start, me.len, &me.moved_len); 243 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 244 fput(donor_filp);
245 245
246 if (!err) 246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
247 if (copy_to_user((struct move_extent *)arg, 247 return -EFAULT;
248 &me, sizeof(me))) 248
249 return -EFAULT;
250 return err; 249 return err;
251 } 250 }
252 251
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cd258463e2a9..e9c61896d605 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include "mballoc.h" 24#include "mballoc.h"
25#include <linux/debugfs.h>
25#include <trace/events/ext4.h> 26#include <trace/events/ext4.h>
26 27
27/* 28/*
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
622 623
623/* FIXME!! need more doc */ 624/* FIXME!! need more doc */
624static void ext4_mb_mark_free_simple(struct super_block *sb, 625static void ext4_mb_mark_free_simple(struct super_block *sb,
625 void *buddy, unsigned first, int len, 626 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
626 struct ext4_group_info *grp) 627 struct ext4_group_info *grp)
627{ 628{
628 struct ext4_sb_info *sbi = EXT4_SB(sb); 629 struct ext4_sb_info *sbi = EXT4_SB(sb);
629 unsigned short min; 630 ext4_grpblk_t min;
630 unsigned short max; 631 ext4_grpblk_t max;
631 unsigned short chunk; 632 ext4_grpblk_t chunk;
632 unsigned short border; 633 unsigned short border;
633 634
634 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 635 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb,
662 void *buddy, void *bitmap, ext4_group_t group) 663 void *buddy, void *bitmap, ext4_group_t group)
663{ 664{
664 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 665 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
665 unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); 666 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
666 unsigned short i = 0; 667 ext4_grpblk_t i = 0;
667 unsigned short first; 668 ext4_grpblk_t first;
668 unsigned short len; 669 ext4_grpblk_t len;
669 unsigned free = 0; 670 unsigned free = 0;
670 unsigned fragments = 0; 671 unsigned fragments = 0;
671 unsigned long long period = get_cycles(); 672 unsigned long long period = get_cycles();
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
743 char *data; 744 char *data;
744 char *bitmap; 745 char *bitmap;
745 746
746 mb_debug("init page %lu\n", page->index); 747 mb_debug(1, "init page %lu\n", page->index);
747 748
748 inode = page->mapping->host; 749 inode = page->mapping->host;
749 sb = inode->i_sb; 750 sb = inode->i_sb;
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
822 set_bitmap_uptodate(bh[i]); 823 set_bitmap_uptodate(bh[i]);
823 bh[i]->b_end_io = end_buffer_read_sync; 824 bh[i]->b_end_io = end_buffer_read_sync;
824 submit_bh(READ, bh[i]); 825 submit_bh(READ, bh[i]);
825 mb_debug("read bitmap for group %u\n", first_group + i); 826 mb_debug(1, "read bitmap for group %u\n", first_group + i);
826 } 827 }
827 828
828 /* wait for I/O completion */ 829 /* wait for I/O completion */
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
862 if ((first_block + i) & 1) { 863 if ((first_block + i) & 1) {
863 /* this is block of buddy */ 864 /* this is block of buddy */
864 BUG_ON(incore == NULL); 865 BUG_ON(incore == NULL);
865 mb_debug("put buddy for group %u in page %lu/%x\n", 866 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
866 group, page->index, i * blocksize); 867 group, page->index, i * blocksize);
867 grinfo = ext4_get_group_info(sb, group); 868 grinfo = ext4_get_group_info(sb, group);
868 grinfo->bb_fragments = 0; 869 grinfo->bb_fragments = 0;
869 memset(grinfo->bb_counters, 0, 870 memset(grinfo->bb_counters, 0,
870 sizeof(unsigned short)*(sb->s_blocksize_bits+2)); 871 sizeof(*grinfo->bb_counters) *
872 (sb->s_blocksize_bits+2));
871 /* 873 /*
872 * incore got set to the group block bitmap below 874 * incore got set to the group block bitmap below
873 */ 875 */
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
878 } else { 880 } else {
879 /* this is block of bitmap */ 881 /* this is block of bitmap */
880 BUG_ON(incore != NULL); 882 BUG_ON(incore != NULL);
881 mb_debug("put bitmap for group %u in page %lu/%x\n", 883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
882 group, page->index, i * blocksize); 884 group, page->index, i * blocksize);
883 885
884 /* see comments in ext4_mb_put_pa() */ 886 /* see comments in ext4_mb_put_pa() */
@@ -908,6 +910,100 @@ out:
908 return err; 910 return err;
909} 911}
910 912
913static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{
916
917 int ret = 0;
918 void *bitmap;
919 int blocks_per_page;
920 int block, pnum, poff;
921 int num_grp_locked = 0;
922 struct ext4_group_info *this_grp;
923 struct ext4_sb_info *sbi = EXT4_SB(sb);
924 struct inode *inode = sbi->s_buddy_cache;
925 struct page *page = NULL, *bitmap_page = NULL;
926
927 mb_debug(1, "init group %u\n", group);
928 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
929 this_grp = ext4_get_group_info(sb, group);
930 /*
931 * This ensures that we don't reinit the buddy cache
932 * page which map to the group from which we are already
933 * allocating. If we are looking at the buddy cache we would
934 * have taken a reference using ext4_mb_load_buddy and that
935 * would have taken the alloc_sem lock.
936 */
937 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
938 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
939 /*
940 * somebody initialized the group
941 * return without doing anything
942 */
943 ret = 0;
944 goto err;
945 }
946 /*
947 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks.
949 * So for each group we need two blocks.
950 */
951 block = group * 2;
952 pnum = block / blocks_per_page;
953 poff = block % blocks_per_page;
954 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
955 if (page) {
956 BUG_ON(page->mapping != inode->i_mapping);
957 ret = ext4_mb_init_cache(page, NULL);
958 if (ret) {
959 unlock_page(page);
960 goto err;
961 }
962 unlock_page(page);
963 }
964 if (page == NULL || !PageUptodate(page)) {
965 ret = -EIO;
966 goto err;
967 }
968 mark_page_accessed(page);
969 bitmap_page = page;
970 bitmap = page_address(page) + (poff * sb->s_blocksize);
971
972 /* init buddy cache */
973 block++;
974 pnum = block / blocks_per_page;
975 poff = block % blocks_per_page;
976 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
977 if (page == bitmap_page) {
978 /*
979 * If both the bitmap and buddy are in
980 * the same page we don't need to force
981 * init the buddy
982 */
983 unlock_page(page);
984 } else if (page) {
985 BUG_ON(page->mapping != inode->i_mapping);
986 ret = ext4_mb_init_cache(page, bitmap);
987 if (ret) {
988 unlock_page(page);
989 goto err;
990 }
991 unlock_page(page);
992 }
993 if (page == NULL || !PageUptodate(page)) {
994 ret = -EIO;
995 goto err;
996 }
997 mark_page_accessed(page);
998err:
999 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1000 if (bitmap_page)
1001 page_cache_release(bitmap_page);
1002 if (page)
1003 page_cache_release(page);
1004 return ret;
1005}
1006
911static noinline_for_stack int 1007static noinline_for_stack int
912ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
913 struct ext4_buddy *e4b) 1009 struct ext4_buddy *e4b)
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
922 struct ext4_sb_info *sbi = EXT4_SB(sb); 1018 struct ext4_sb_info *sbi = EXT4_SB(sb);
923 struct inode *inode = sbi->s_buddy_cache; 1019 struct inode *inode = sbi->s_buddy_cache;
924 1020
925 mb_debug("load group %u\n", group); 1021 mb_debug(1, "load group %u\n", group);
926 1022
927 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1023 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
928 grp = ext4_get_group_info(sb, group); 1024 grp = ext4_get_group_info(sb, group);
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
941 * groups mapped by the page is blocked 1037 * groups mapped by the page is blocked
942 * till we are done with allocation 1038 * till we are done with allocation
943 */ 1039 */
1040repeat_load_buddy:
944 down_read(e4b->alloc_semp); 1041 down_read(e4b->alloc_semp);
945 1042
1043 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1044 /* we need to check for group need init flag
1045 * with alloc_semp held so that we can be sure
1046 * that new blocks didn't get added to the group
1047 * when we are loading the buddy cache
1048 */
1049 up_read(e4b->alloc_semp);
1050 /*
1051 * we need full data about the group
1052 * to make a good selection
1053 */
1054 ret = ext4_mb_init_group(sb, group);
1055 if (ret)
1056 return ret;
1057 goto repeat_load_buddy;
1058 }
1059
946 /* 1060 /*
947 * the buddy cache inode stores the block bitmap 1061 * the buddy cache inode stores the block bitmap
948 * and buddy information in consecutive blocks. 1062 * and buddy information in consecutive blocks.
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1360 ac->alloc_semp = e4b->alloc_semp; 1474 ac->alloc_semp = e4b->alloc_semp;
1361 e4b->alloc_semp = NULL; 1475 e4b->alloc_semp = NULL;
1362 /* store last allocated for subsequent stream allocation */ 1476 /* store last allocated for subsequent stream allocation */
1363 if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { 1477 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1364 spin_lock(&sbi->s_md_lock); 1478 spin_lock(&sbi->s_md_lock);
1365 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1479 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1366 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1480 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1837 1951
1838} 1952}
1839 1953
1840static noinline_for_stack
1841int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1842{
1843
1844 int ret;
1845 void *bitmap;
1846 int blocks_per_page;
1847 int block, pnum, poff;
1848 int num_grp_locked = 0;
1849 struct ext4_group_info *this_grp;
1850 struct ext4_sb_info *sbi = EXT4_SB(sb);
1851 struct inode *inode = sbi->s_buddy_cache;
1852 struct page *page = NULL, *bitmap_page = NULL;
1853
1854 mb_debug("init group %lu\n", group);
1855 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1856 this_grp = ext4_get_group_info(sb, group);
1857 /*
1858 * This ensures we don't add group
1859 * to this buddy cache via resize
1860 */
1861 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1862 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1863 /*
1864 * somebody initialized the group
1865 * return without doing anything
1866 */
1867 ret = 0;
1868 goto err;
1869 }
1870 /*
1871 * the buddy cache inode stores the block bitmap
1872 * and buddy information in consecutive blocks.
1873 * So for each group we need two blocks.
1874 */
1875 block = group * 2;
1876 pnum = block / blocks_per_page;
1877 poff = block % blocks_per_page;
1878 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1879 if (page) {
1880 BUG_ON(page->mapping != inode->i_mapping);
1881 ret = ext4_mb_init_cache(page, NULL);
1882 if (ret) {
1883 unlock_page(page);
1884 goto err;
1885 }
1886 unlock_page(page);
1887 }
1888 if (page == NULL || !PageUptodate(page)) {
1889 ret = -EIO;
1890 goto err;
1891 }
1892 mark_page_accessed(page);
1893 bitmap_page = page;
1894 bitmap = page_address(page) + (poff * sb->s_blocksize);
1895
1896 /* init buddy cache */
1897 block++;
1898 pnum = block / blocks_per_page;
1899 poff = block % blocks_per_page;
1900 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1901 if (page == bitmap_page) {
1902 /*
1903 * If both the bitmap and buddy are in
1904 * the same page we don't need to force
1905 * init the buddy
1906 */
1907 unlock_page(page);
1908 } else if (page) {
1909 BUG_ON(page->mapping != inode->i_mapping);
1910 ret = ext4_mb_init_cache(page, bitmap);
1911 if (ret) {
1912 unlock_page(page);
1913 goto err;
1914 }
1915 unlock_page(page);
1916 }
1917 if (page == NULL || !PageUptodate(page)) {
1918 ret = -EIO;
1919 goto err;
1920 }
1921 mark_page_accessed(page);
1922err:
1923 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1924 if (bitmap_page)
1925 page_cache_release(bitmap_page);
1926 if (page)
1927 page_cache_release(page);
1928 return ret;
1929}
1930
1931static noinline_for_stack int 1954static noinline_for_stack int
1932ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 1955ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1933{ 1956{
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1938 struct ext4_sb_info *sbi; 1961 struct ext4_sb_info *sbi;
1939 struct super_block *sb; 1962 struct super_block *sb;
1940 struct ext4_buddy e4b; 1963 struct ext4_buddy e4b;
1941 loff_t size, isize;
1942 1964
1943 sb = ac->ac_sb; 1965 sb = ac->ac_sb;
1944 sbi = EXT4_SB(sb); 1966 sbi = EXT4_SB(sb);
1945 ngroups = ext4_get_groups_count(sb); 1967 ngroups = ext4_get_groups_count(sb);
1968 /* non-extent files are limited to low blocks/groups */
1969 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
1970 ngroups = sbi->s_blockfile_groups;
1971
1946 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1972 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1947 1973
1948 /* first, try the goal */ 1974 /* first, try the goal */
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1974 } 2000 }
1975 2001
1976 bsbits = ac->ac_sb->s_blocksize_bits; 2002 bsbits = ac->ac_sb->s_blocksize_bits;
1977 /* if stream allocation is enabled, use global goal */
1978 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
1979 isize = i_size_read(ac->ac_inode) >> bsbits;
1980 if (size < isize)
1981 size = isize;
1982 2003
1983 if (size < sbi->s_mb_stream_request && 2004 /* if stream allocation is enabled, use global goal */
1984 (ac->ac_flags & EXT4_MB_HINT_DATA)) { 2005 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1985 /* TBD: may be hot point */ 2006 /* TBD: may be hot point */
1986 spin_lock(&sbi->s_md_lock); 2007 spin_lock(&sbi->s_md_lock);
1987 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2008 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
1988 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2009 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
1989 spin_unlock(&sbi->s_md_lock); 2010 spin_unlock(&sbi->s_md_lock);
1990 } 2011 }
2012
1991 /* Let's just scan groups to find more-less suitable blocks */ 2013 /* Let's just scan groups to find more-less suitable blocks */
1992 cr = ac->ac_2order ? 0 : 1; 2014 cr = ac->ac_2order ? 0 : 1;
1993 /* 2015 /*
@@ -2015,27 +2037,6 @@ repeat:
2015 if (grp->bb_free == 0) 2037 if (grp->bb_free == 0)
2016 continue; 2038 continue;
2017 2039
2018 /*
2019 * if the group is already init we check whether it is
2020 * a good group and if not we don't load the buddy
2021 */
2022 if (EXT4_MB_GRP_NEED_INIT(grp)) {
2023 /*
2024 * we need full data about the group
2025 * to make a good selection
2026 */
2027 err = ext4_mb_init_group(sb, group);
2028 if (err)
2029 goto out;
2030 }
2031
2032 /*
2033 * If the particular group doesn't satisfy our
2034 * criteria we continue with the next group
2035 */
2036 if (!ext4_mb_good_group(ac, group, cr))
2037 continue;
2038
2039 err = ext4_mb_load_buddy(sb, group, &e4b); 2040 err = ext4_mb_load_buddy(sb, group, &e4b);
2040 if (err) 2041 if (err)
2041 goto out; 2042 goto out;
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2156 2157
2157 if (v == SEQ_START_TOKEN) { 2158 if (v == SEQ_START_TOKEN) {
2158 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " 2159 seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s "
2159 "%-5s %-2s %-5s %-5s %-5s %-6s\n", 2160 "%-5s %-2s %-6s %-5s %-5s %-6s\n",
2160 "pid", "inode", "original", "goal", "result", "found", 2161 "pid", "inode", "original", "goal", "result", "found",
2161 "grps", "cr", "flags", "merge", "tail", "broken"); 2162 "grps", "cr", "flags", "merge", "tail", "broken");
2162 return 0; 2163 return 0;
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v)
2164 2165
2165 if (hs->op == EXT4_MB_HISTORY_ALLOC) { 2166 if (hs->op == EXT4_MB_HISTORY_ALLOC) {
2166 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " 2167 fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u "
2167 "%-5u %-5s %-5u %-6u\n"; 2168 "0x%04x %-5s %-5u %-6u\n";
2168 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, 2169 sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group,
2169 hs->result.fe_start, hs->result.fe_len, 2170 hs->result.fe_start, hs->result.fe_len,
2170 hs->result.fe_logical); 2171 hs->result.fe_logical);
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v)
2205{ 2206{
2206} 2207}
2207 2208
2208static struct seq_operations ext4_mb_seq_history_ops = { 2209static const struct seq_operations ext4_mb_seq_history_ops = {
2209 .start = ext4_mb_seq_history_start, 2210 .start = ext4_mb_seq_history_start,
2210 .next = ext4_mb_seq_history_next, 2211 .next = ext4_mb_seq_history_next,
2211 .stop = ext4_mb_seq_history_stop, 2212 .stop = ext4_mb_seq_history_stop,
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file,
2287 return count; 2288 return count;
2288} 2289}
2289 2290
2290static struct file_operations ext4_mb_seq_history_fops = { 2291static const struct file_operations ext4_mb_seq_history_fops = {
2291 .owner = THIS_MODULE, 2292 .owner = THIS_MODULE,
2292 .open = ext4_mb_seq_history_open, 2293 .open = ext4_mb_seq_history_open,
2293 .read = seq_read, 2294 .read = seq_read,
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2328 struct ext4_buddy e4b; 2329 struct ext4_buddy e4b;
2329 struct sg { 2330 struct sg {
2330 struct ext4_group_info info; 2331 struct ext4_group_info info;
2331 unsigned short counters[16]; 2332 ext4_grpblk_t counters[16];
2332 } sg; 2333 } sg;
2333 2334
2334 group--; 2335 group--;
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2366{ 2367{
2367} 2368}
2368 2369
2369static struct seq_operations ext4_mb_seq_groups_ops = { 2370static const struct seq_operations ext4_mb_seq_groups_ops = {
2370 .start = ext4_mb_seq_groups_start, 2371 .start = ext4_mb_seq_groups_start,
2371 .next = ext4_mb_seq_groups_next, 2372 .next = ext4_mb_seq_groups_next,
2372 .stop = ext4_mb_seq_groups_stop, 2373 .stop = ext4_mb_seq_groups_stop,
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2387 2388
2388} 2389}
2389 2390
2390static struct file_operations ext4_mb_seq_groups_fops = { 2391static const struct file_operations ext4_mb_seq_groups_fops = {
2391 .owner = THIS_MODULE, 2392 .owner = THIS_MODULE,
2392 .open = ext4_mb_seq_groups_open, 2393 .open = ext4_mb_seq_groups_open,
2393 .read = seq_read, 2394 .read = seq_read,
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2532 2533
2533 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2534 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2534 init_rwsem(&meta_group_info[i]->alloc_sem); 2535 init_rwsem(&meta_group_info[i]->alloc_sem);
2535 meta_group_info[i]->bb_free_root.rb_node = NULL;; 2536 meta_group_info[i]->bb_free_root.rb_node = NULL;
2536 2537
2537#ifdef DOUBLE_CHECK 2538#ifdef DOUBLE_CHECK
2538 { 2539 {
@@ -2558,26 +2559,15 @@ exit_meta_group_info:
2558 return -ENOMEM; 2559 return -ENOMEM;
2559} /* ext4_mb_add_groupinfo */ 2560} /* ext4_mb_add_groupinfo */
2560 2561
2561/*
2562 * Update an existing group.
2563 * This function is used for online resize
2564 */
2565void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
2566{
2567 grp->bb_free += add;
2568}
2569
2570static int ext4_mb_init_backend(struct super_block *sb) 2562static int ext4_mb_init_backend(struct super_block *sb)
2571{ 2563{
2572 ext4_group_t ngroups = ext4_get_groups_count(sb); 2564 ext4_group_t ngroups = ext4_get_groups_count(sb);
2573 ext4_group_t i; 2565 ext4_group_t i;
2574 int metalen;
2575 struct ext4_sb_info *sbi = EXT4_SB(sb); 2566 struct ext4_sb_info *sbi = EXT4_SB(sb);
2576 struct ext4_super_block *es = sbi->s_es; 2567 struct ext4_super_block *es = sbi->s_es;
2577 int num_meta_group_infos; 2568 int num_meta_group_infos;
2578 int num_meta_group_infos_max; 2569 int num_meta_group_infos_max;
2579 int array_size; 2570 int array_size;
2580 struct ext4_group_info **meta_group_info;
2581 struct ext4_group_desc *desc; 2571 struct ext4_group_desc *desc;
2582 2572
2583 /* This is the number of blocks used by GDT */ 2573 /* This is the number of blocks used by GDT */
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb)
2622 goto err_freesgi; 2612 goto err_freesgi;
2623 } 2613 }
2624 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2614 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2625
2626 metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb);
2627 for (i = 0; i < num_meta_group_infos; i++) {
2628 if ((i + 1) == num_meta_group_infos)
2629 metalen = sizeof(*meta_group_info) *
2630 (ngroups -
2631 (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
2632 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2633 if (meta_group_info == NULL) {
2634 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2635 "buddy group\n");
2636 goto err_freemeta;
2637 }
2638 sbi->s_group_info[i] = meta_group_info;
2639 }
2640
2641 for (i = 0; i < ngroups; i++) { 2615 for (i = 0; i < ngroups; i++) {
2642 desc = ext4_get_group_desc(sb, i, NULL); 2616 desc = ext4_get_group_desc(sb, i, NULL);
2643 if (desc == NULL) { 2617 if (desc == NULL) {
@@ -2655,7 +2629,6 @@ err_freebuddy:
2655 while (i-- > 0) 2629 while (i-- > 0)
2656 kfree(ext4_get_group_info(sb, i)); 2630 kfree(ext4_get_group_info(sb, i));
2657 i = num_meta_group_infos; 2631 i = num_meta_group_infos;
2658err_freemeta:
2659 while (i-- > 0) 2632 while (i-- > 0)
2660 kfree(sbi->s_group_info[i]); 2633 kfree(sbi->s_group_info[i]);
2661 iput(sbi->s_buddy_cache); 2634 iput(sbi->s_buddy_cache);
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2672 unsigned max; 2645 unsigned max;
2673 int ret; 2646 int ret;
2674 2647
2675 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); 2648 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2676 2649
2677 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2650 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2678 if (sbi->s_mb_offsets == NULL) { 2651 if (sbi->s_mb_offsets == NULL) {
2679 return -ENOMEM; 2652 return -ENOMEM;
2680 } 2653 }
2681 2654
2682 i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); 2655 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2683 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2656 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2684 if (sbi->s_mb_maxs == NULL) { 2657 if (sbi->s_mb_maxs == NULL) {
2685 kfree(sbi->s_mb_offsets); 2658 kfree(sbi->s_mb_offsets);
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2758 kmem_cache_free(ext4_pspace_cachep, pa); 2731 kmem_cache_free(ext4_pspace_cachep, pa);
2759 } 2732 }
2760 if (count) 2733 if (count)
2761 mb_debug("mballoc: %u PAs left\n", count); 2734 mb_debug(1, "mballoc: %u PAs left\n", count);
2762 2735
2763} 2736}
2764 2737
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2839 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2812 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2840 entry = list_entry(l, struct ext4_free_data, list); 2813 entry = list_entry(l, struct ext4_free_data, list);
2841 2814
2842 mb_debug("gonna free %u blocks in group %u (0x%p):", 2815 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2843 entry->count, entry->group, entry); 2816 entry->count, entry->group, entry);
2844 2817
2845 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2818 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2874 ext4_mb_release_desc(&e4b); 2847 ext4_mb_release_desc(&e4b);
2875 } 2848 }
2876 2849
2877 mb_debug("freed %u blocks in %u structures\n", count, count2); 2850 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2851}
2852
2853#ifdef CONFIG_EXT4_DEBUG
2854u8 mb_enable_debug __read_mostly;
2855
2856static struct dentry *debugfs_dir;
2857static struct dentry *debugfs_debug;
2858
2859static void __init ext4_create_debugfs_entry(void)
2860{
2861 debugfs_dir = debugfs_create_dir("ext4", NULL);
2862 if (debugfs_dir)
2863 debugfs_debug = debugfs_create_u8("mballoc-debug",
2864 S_IRUGO | S_IWUSR,
2865 debugfs_dir,
2866 &mb_enable_debug);
2867}
2868
2869static void ext4_remove_debugfs_entry(void)
2870{
2871 debugfs_remove(debugfs_debug);
2872 debugfs_remove(debugfs_dir);
2878} 2873}
2879 2874
2875#else
2876
2877static void __init ext4_create_debugfs_entry(void)
2878{
2879}
2880
2881static void ext4_remove_debugfs_entry(void)
2882{
2883}
2884
2885#endif
2886
2880int __init init_ext4_mballoc(void) 2887int __init init_ext4_mballoc(void)
2881{ 2888{
2882 ext4_pspace_cachep = 2889 ext4_pspace_cachep =
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void)
2904 kmem_cache_destroy(ext4_ac_cachep); 2911 kmem_cache_destroy(ext4_ac_cachep);
2905 return -ENOMEM; 2912 return -ENOMEM;
2906 } 2913 }
2914 ext4_create_debugfs_entry();
2907 return 0; 2915 return 0;
2908} 2916}
2909 2917
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void)
2917 kmem_cache_destroy(ext4_pspace_cachep); 2925 kmem_cache_destroy(ext4_pspace_cachep);
2918 kmem_cache_destroy(ext4_ac_cachep); 2926 kmem_cache_destroy(ext4_ac_cachep);
2919 kmem_cache_destroy(ext4_free_ext_cachep); 2927 kmem_cache_destroy(ext4_free_ext_cachep);
2928 ext4_remove_debugfs_entry();
2920} 2929}
2921 2930
2922 2931
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3061 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 3070 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
3062 else 3071 else
3063 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 3072 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3064 mb_debug("#%u: goal %u blocks for locality group\n", 3073 mb_debug(1, "#%u: goal %u blocks for locality group\n",
3065 current->pid, ac->ac_g_ex.fe_len); 3074 current->pid, ac->ac_g_ex.fe_len);
3066} 3075}
3067 3076
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3180 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3189 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3181 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3190 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3182 3191
3183 /* skip PA normalized request doesn't overlap with */ 3192 /* skip PAs this normalized request doesn't overlap with */
3184 if (pa->pa_lstart >= end) { 3193 if (pa->pa_lstart >= end || pa_end <= start) {
3185 spin_unlock(&pa->pa_lock);
3186 continue;
3187 }
3188 if (pa_end <= start) {
3189 spin_unlock(&pa->pa_lock); 3194 spin_unlock(&pa->pa_lock);
3190 continue; 3195 continue;
3191 } 3196 }
3192 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3197 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3193 3198
3199 /* adjust start or end to be adjacent to this pa */
3194 if (pa_end <= ac->ac_o_ex.fe_logical) { 3200 if (pa_end <= ac->ac_o_ex.fe_logical) {
3195 BUG_ON(pa_end < start); 3201 BUG_ON(pa_end < start);
3196 start = pa_end; 3202 start = pa_end;
3197 } 3203 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3198
3199 if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3200 BUG_ON(pa->pa_lstart > end); 3204 BUG_ON(pa->pa_lstart > end);
3201 end = pa->pa_lstart; 3205 end = pa->pa_lstart;
3202 } 3206 }
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3251 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3255 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3252 } 3256 }
3253 3257
3254 mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, 3258 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3255 (unsigned) orig_size, (unsigned) start); 3259 (unsigned) orig_size, (unsigned) start);
3256} 3260}
3257 3261
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3300 BUG_ON(pa->pa_free < len); 3304 BUG_ON(pa->pa_free < len);
3301 pa->pa_free -= len; 3305 pa->pa_free -= len;
3302 3306
3303 mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); 3307 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3304} 3308}
3305 3309
3306/* 3310/*
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3324 * in on-disk bitmap -- see ext4_mb_release_context() 3328 * in on-disk bitmap -- see ext4_mb_release_context()
3325 * Other CPUs are prevented from allocating from this pa by lg_mutex 3329 * Other CPUs are prevented from allocating from this pa by lg_mutex
3326 */ 3330 */
3327 mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3331 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3328} 3332}
3329 3333
3330/* 3334/*
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3382 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3386 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3383 continue; 3387 continue;
3384 3388
3389 /* non-extent files can't have physical blocks past 2^32 */
3390 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
3391 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3392 continue;
3393
3385 /* found preallocated blocks, use them */ 3394 /* found preallocated blocks, use them */
3386 spin_lock(&pa->pa_lock); 3395 spin_lock(&pa->pa_lock);
3387 if (pa->pa_deleted == 0 && pa->pa_free) { 3396 if (pa->pa_deleted == 0 && pa->pa_free) {
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3503 preallocated += len; 3512 preallocated += len;
3504 count++; 3513 count++;
3505 } 3514 }
3506 mb_debug("prellocated %u for group %u\n", preallocated, group); 3515 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3507} 3516}
3508 3517
3509static void ext4_mb_pa_callback(struct rcu_head *head) 3518static void ext4_mb_pa_callback(struct rcu_head *head)
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3638 pa->pa_deleted = 0; 3647 pa->pa_deleted = 0;
3639 pa->pa_type = MB_INODE_PA; 3648 pa->pa_type = MB_INODE_PA;
3640 3649
3641 mb_debug("new inode pa %p: %llu/%u for %u\n", pa, 3650 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3642 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3651 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3643 trace_ext4_mb_new_inode_pa(ac, pa); 3652 trace_ext4_mb_new_inode_pa(ac, pa);
3644 3653
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3698 pa->pa_deleted = 0; 3707 pa->pa_deleted = 0;
3699 pa->pa_type = MB_GROUP_PA; 3708 pa->pa_type = MB_GROUP_PA;
3700 3709
3701 mb_debug("new group pa %p: %llu/%u for %u\n", pa, 3710 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3702 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3711 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3703 trace_ext4_mb_new_group_pa(ac, pa); 3712 trace_ext4_mb_new_group_pa(ac, pa);
3704 3713
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3777 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3786 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3778 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + 3787 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
3779 le32_to_cpu(sbi->s_es->s_first_data_block); 3788 le32_to_cpu(sbi->s_es->s_first_data_block);
3780 mb_debug(" free preallocated %u/%u in group %u\n", 3789 mb_debug(1, " free preallocated %u/%u in group %u\n",
3781 (unsigned) start, (unsigned) next - bit, 3790 (unsigned) start, (unsigned) next - bit,
3782 (unsigned) group); 3791 (unsigned) group);
3783 free += next - bit; 3792 free += next - bit;
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
3868 int busy = 0; 3877 int busy = 0;
3869 int free = 0; 3878 int free = 0;
3870 3879
3871 mb_debug("discard preallocation for group %u\n", group); 3880 mb_debug(1, "discard preallocation for group %u\n", group);
3872 3881
3873 if (list_empty(&grp->bb_prealloc_list)) 3882 if (list_empty(&grp->bb_prealloc_list))
3874 return 0; 3883 return 0;
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode)
3992 return; 4001 return;
3993 } 4002 }
3994 4003
3995 mb_debug("discard preallocation for inode %lu\n", inode->i_ino); 4004 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3996 trace_ext4_discard_preallocations(inode); 4005 trace_ext4_discard_preallocations(inode);
3997 4006
3998 INIT_LIST_HEAD(&list); 4007 INIT_LIST_HEAD(&list);
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
4097{ 4106{
4098 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); 4107 BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list));
4099} 4108}
4100#ifdef MB_DEBUG 4109#ifdef CONFIG_EXT4_DEBUG
4101static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 4110static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4102{ 4111{
4103 struct super_block *sb = ac->ac_sb; 4112 struct super_block *sb = ac->ac_sb;
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4139 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 4148 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4140 NULL, &start); 4149 NULL, &start);
4141 spin_unlock(&pa->pa_lock); 4150 spin_unlock(&pa->pa_lock);
4142 printk(KERN_ERR "PA:%lu:%d:%u \n", i, 4151 printk(KERN_ERR "PA:%u:%d:%u \n", i,
4143 start, pa->pa_len); 4152 start, pa->pa_len);
4144 } 4153 }
4145 ext4_unlock_group(sb, i); 4154 ext4_unlock_group(sb, i);
4146 4155
4147 if (grp->bb_free == 0) 4156 if (grp->bb_free == 0)
4148 continue; 4157 continue;
4149 printk(KERN_ERR "%lu: %d/%d \n", 4158 printk(KERN_ERR "%u: %d/%d \n",
4150 i, grp->bb_free, grp->bb_fragments); 4159 i, grp->bb_free, grp->bb_fragments);
4151 } 4160 }
4152 printk(KERN_ERR "\n"); 4161 printk(KERN_ERR "\n");
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4174 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 4183 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4175 return; 4184 return;
4176 4185
4186 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4187 return;
4188
4177 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 4189 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
4178 isize = i_size_read(ac->ac_inode) >> bsbits; 4190 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4191 >> bsbits;
4179 size = max(size, isize); 4192 size = max(size, isize);
4180 4193
4181 /* don't use group allocation for large files */ 4194 if ((size == isize) &&
4182 if (size >= sbi->s_mb_stream_request) 4195 !ext4_fs_is_busy(sbi) &&
4196 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4197 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4183 return; 4198 return;
4199 }
4184 4200
4185 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 4201 /* don't use group allocation for large files */
4202 if (size >= sbi->s_mb_stream_request) {
4203 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4186 return; 4204 return;
4205 }
4187 4206
4188 BUG_ON(ac->ac_lg != NULL); 4207 BUG_ON(ac->ac_lg != NULL);
4189 /* 4208 /*
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246 * locality group. this is a policy, actually */ 4265 * locality group. this is a policy, actually */
4247 ext4_mb_group_or_file(ac); 4266 ext4_mb_group_or_file(ac);
4248 4267
4249 mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4268 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4250 "left: %u/%u, right %u/%u to %swritable\n", 4269 "left: %u/%u, right %u/%u to %swritable\n",
4251 (unsigned) ar->len, (unsigned) ar->logical, 4270 (unsigned) ar->len, (unsigned) ar->logical,
4252 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4271 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4268 struct ext4_prealloc_space *pa, *tmp; 4287 struct ext4_prealloc_space *pa, *tmp;
4269 struct ext4_allocation_context *ac; 4288 struct ext4_allocation_context *ac;
4270 4289
4271 mb_debug("discard locality group preallocation\n"); 4290 mb_debug(1, "discard locality group preallocation\n");
4272 4291
4273 INIT_LIST_HEAD(&discard_list); 4292 INIT_LIST_HEAD(&discard_list);
4274 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4293 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index c96bb19f58f9..188d3d709b24 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -37,11 +37,19 @@
37 37
38/* 38/*
39 */ 39 */
40#define MB_DEBUG__ 40#ifdef CONFIG_EXT4_DEBUG
41#ifdef MB_DEBUG 41extern u8 mb_enable_debug;
42#define mb_debug(fmt, a...) printk(fmt, ##a) 42
43#define mb_debug(n, fmt, a...) \
44 do { \
45 if ((n) <= mb_enable_debug) { \
46 printk(KERN_DEBUG "(%s, %d): %s: ", \
47 __FILE__, __LINE__, __func__); \
48 printk(fmt, ## a); \
49 } \
50 } while (0)
43#else 51#else
44#define mb_debug(fmt, a...) 52#define mb_debug(n, fmt, a...)
45#endif 53#endif
46 54
47/* 55/*
@@ -128,8 +136,8 @@ struct ext4_prealloc_space {
128 unsigned pa_deleted; 136 unsigned pa_deleted;
129 ext4_fsblk_t pa_pstart; /* phys. block */ 137 ext4_fsblk_t pa_pstart; /* phys. block */
130 ext4_lblk_t pa_lstart; /* log. block */ 138 ext4_lblk_t pa_lstart; /* log. block */
131 unsigned short pa_len; /* len of preallocated chunk */ 139 ext4_grpblk_t pa_len; /* len of preallocated chunk */
132 unsigned short pa_free; /* how many blocks are free */ 140 ext4_grpblk_t pa_free; /* how many blocks are free */
133 unsigned short pa_type; /* pa type. inode or group */ 141 unsigned short pa_type; /* pa type. inode or group */
134 spinlock_t *pa_obj_lock; 142 spinlock_t *pa_obj_lock;
135 struct inode *pa_inode; /* hack, for history only */ 143 struct inode *pa_inode; /* hack, for history only */
@@ -144,7 +152,7 @@ struct ext4_free_extent {
144 ext4_lblk_t fe_logical; 152 ext4_lblk_t fe_logical;
145 ext4_grpblk_t fe_start; 153 ext4_grpblk_t fe_start;
146 ext4_group_t fe_group; 154 ext4_group_t fe_group;
147 int fe_len; 155 ext4_grpblk_t fe_len;
148}; 156};
149 157
150/* 158/*
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 313a50b39741..bf519f239ae6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode,
353 353
354 down_write(&EXT4_I(inode)->i_data_sem); 354 down_write(&EXT4_I(inode)->i_data_sem);
355 /* 355 /*
356 * if EXT4_EXT_MIGRATE is cleared a block allocation 356 * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation
357 * happened after we started the migrate. We need to 357 * happened after we started the migrate. We need to
358 * fail the migrate 358 * fail the migrate
359 */ 359 */
360 if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { 360 if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) {
361 retval = -EAGAIN; 361 retval = -EAGAIN;
362 up_write(&EXT4_I(inode)->i_data_sem); 362 up_write(&EXT4_I(inode)->i_data_sem);
363 goto err_out; 363 goto err_out;
364 } else 364 } else
365 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & 365 EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
366 ~EXT4_EXT_MIGRATE;
367 /* 366 /*
368 * We have the extent map build with the tmp inode. 367 * We have the extent map build with the tmp inode.
369 * Now copy the i_data across 368 * Now copy the i_data across
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode)
517 * when we add extents we extent the journal 516 * when we add extents we extent the journal
518 */ 517 */
519 /* 518 /*
520 * Even though we take i_mutex we can still cause block allocation 519 * Even though we take i_mutex we can still cause block
521 * via mmap write to holes. If we have allocated new blocks we fail 520 * allocation via mmap write to holes. If we have allocated
522 * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. 521 * new blocks we fail migrate. New block allocation will
523 * The flag is updated with i_data_sem held to prevent racing with 522 * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated
524 * block allocation. 523 * with i_data_sem held to prevent racing with block
524 * allocation.
525 */ 525 */
526 down_read((&EXT4_I(inode)->i_data_sem)); 526 down_read((&EXT4_I(inode)->i_data_sem));
527 EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; 527 EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;
528 up_read((&EXT4_I(inode)->i_data_sem)); 528 up_read((&EXT4_I(inode)->i_data_sem));
529 529
530 handle = ext4_journal_start(inode, 1); 530 handle = ext4_journal_start(inode, 1);
@@ -618,7 +618,7 @@ err_out:
618 tmp_inode->i_nlink = 0; 618 tmp_inode->i_nlink = 0;
619 619
620 ext4_journal_stop(handle); 620 ext4_journal_stop(handle);
621 621 unlock_new_inode(tmp_inode);
622 iput(tmp_inode); 622 iput(tmp_inode);
623 623
624 return retval; 624 return retval;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index bbf2dd9404dc..c07a2915e40b 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -19,14 +19,31 @@
19#include "ext4_extents.h" 19#include "ext4_extents.h"
20#include "ext4.h" 20#include "ext4.h"
21 21
22#define get_ext_path(path, inode, block, ret) \ 22/**
23 do { \ 23 * get_ext_path - Find an extent path for designated logical block number.
24 path = ext4_ext_find_extent(inode, block, path); \ 24 *
25 if (IS_ERR(path)) { \ 25 * @inode: an inode which is searched
26 ret = PTR_ERR(path); \ 26 * @lblock: logical block number to find an extent path
27 path = NULL; \ 27 * @path: pointer to an extent path pointer (for output)
28 } \ 28 *
29 } while (0) 29 * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
30 * on failure.
31 */
32static inline int
33get_ext_path(struct inode *inode, ext4_lblk_t lblock,
34 struct ext4_ext_path **path)
35{
36 int ret = 0;
37
38 *path = ext4_ext_find_extent(inode, lblock, *path);
39 if (IS_ERR(*path)) {
40 ret = PTR_ERR(*path);
41 *path = NULL;
42 } else if ((*path)[ext_depth(inode)].p_ext == NULL)
43 ret = -ENODATA;
44
45 return ret;
46}
30 47
31/** 48/**
32 * copy_extent_status - Copy the extent's initialization status 49 * copy_extent_status - Copy the extent's initialization status
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
113} 130}
114 131
115/** 132/**
133 * mext_check_null_inode - NULL check for two inodes
134 *
135 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
136 */
137static int
138mext_check_null_inode(struct inode *inode1, struct inode *inode2,
139 const char *function)
140{
141 int ret = 0;
142
143 if (inode1 == NULL) {
144 ext4_error(inode2->i_sb, function,
145 "Both inodes should not be NULL: "
146 "inode1 NULL inode2 %lu", inode2->i_ino);
147 ret = -EIO;
148 } else if (inode2 == NULL) {
149 ext4_error(inode1->i_sb, function,
150 "Both inodes should not be NULL: "
151 "inode1 %lu inode2 NULL", inode1->i_ino);
152 ret = -EIO;
153 }
154 return ret;
155}
156
157/**
116 * mext_double_down_read - Acquire two inodes' read semaphore 158 * mext_double_down_read - Acquire two inodes' read semaphore
117 * 159 *
118 * @orig_inode: original inode structure 160 * @orig_inode: original inode structure
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
124{ 166{
125 struct inode *first = orig_inode, *second = donor_inode; 167 struct inode *first = orig_inode, *second = donor_inode;
126 168
127 BUG_ON(orig_inode == NULL || donor_inode == NULL);
128
129 /* 169 /*
130 * Use the inode number to provide the stable locking order instead 170 * Use the inode number to provide the stable locking order instead
131 * of its address, because the C language doesn't guarantee you can 171 * of its address, because the C language doesn't guarantee you can
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
152{ 192{
153 struct inode *first = orig_inode, *second = donor_inode; 193 struct inode *first = orig_inode, *second = donor_inode;
154 194
155 BUG_ON(orig_inode == NULL || donor_inode == NULL);
156
157 /* 195 /*
158 * Use the inode number to provide the stable locking order instead 196 * Use the inode number to provide the stable locking order instead
159 * of its address, because the C language doesn't guarantee you can 197 * of its address, because the C language doesn't guarantee you can
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
178static void 216static void
179mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
180{ 218{
181 BUG_ON(orig_inode == NULL || donor_inode == NULL);
182
183 up_read(&EXT4_I(orig_inode)->i_data_sem); 219 up_read(&EXT4_I(orig_inode)->i_data_sem);
184 up_read(&EXT4_I(donor_inode)->i_data_sem); 220 up_read(&EXT4_I(donor_inode)->i_data_sem);
185} 221}
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode)
194static void 230static void
195mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) 231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
196{ 232{
197 BUG_ON(orig_inode == NULL || donor_inode == NULL);
198
199 up_write(&EXT4_I(orig_inode)->i_data_sem); 233 up_write(&EXT4_I(orig_inode)->i_data_sem);
200 up_write(&EXT4_I(donor_inode)->i_data_sem); 234 up_write(&EXT4_I(donor_inode)->i_data_sem);
201} 235}
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
283 } 317 }
284 318
285 if (new_flag) { 319 if (new_flag) {
286 get_ext_path(orig_path, orig_inode, eblock, err); 320 err = get_ext_path(orig_inode, eblock, &orig_path);
287 if (orig_path == NULL) 321 if (err)
288 goto out; 322 goto out;
289 323
290 if (ext4_ext_insert_extent(handle, orig_inode, 324 if (ext4_ext_insert_extent(handle, orig_inode,
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
293 } 327 }
294 328
295 if (end_flag) { 329 if (end_flag) {
296 get_ext_path(orig_path, orig_inode, 330 err = get_ext_path(orig_inode,
297 le32_to_cpu(end_ext->ee_block) - 1, err); 331 le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
298 if (orig_path == NULL) 332 if (err)
299 goto out; 333 goto out;
300 334
301 if (ext4_ext_insert_extent(handle, orig_inode, 335 if (ext4_ext_insert_extent(handle, orig_inode,
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
519 * oext |-----------| 553 * oext |-----------|
520 * new_ext |-------| 554 * new_ext |-------|
521 */ 555 */
522 BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); 556 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
557 ext4_error(orig_inode->i_sb, __func__,
558 "new_ext_end(%u) should be less than or equal to "
559 "oext->ee_block(%u) + oext_alen(%d) - 1",
560 new_ext_end, le32_to_cpu(oext->ee_block),
561 oext_alen);
562 ret = -EIO;
563 goto out;
564 }
523 565
524 /* 566 /*
525 * Case: new_ext is smaller than original extent 567 * Case: new_ext is smaller than original extent
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
543 585
544 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, 586 ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
545 o_end, &start_ext, &new_ext, &end_ext); 587 o_end, &start_ext, &new_ext, &end_ext);
588out:
546 return ret; 589 return ret;
547} 590}
548 591
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
554 * @orig_off: block offset of original inode 597 * @orig_off: block offset of original inode
555 * @donor_off: block offset of donor inode 598 * @donor_off: block offset of donor inode
556 * @max_count: the maximun length of extents 599 * @max_count: the maximun length of extents
600 *
601 * Return 0 on success, or a negative error value on failure.
557 */ 602 */
558static void 603static int
559mext_calc_swap_extents(struct ext4_extent *tmp_dext, 604mext_calc_swap_extents(struct ext4_extent *tmp_dext,
560 struct ext4_extent *tmp_oext, 605 struct ext4_extent *tmp_oext,
561 ext4_lblk_t orig_off, ext4_lblk_t donor_off, 606 ext4_lblk_t orig_off, ext4_lblk_t donor_off,
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
564 ext4_lblk_t diff, orig_diff; 609 ext4_lblk_t diff, orig_diff;
565 struct ext4_extent dext_old, oext_old; 610 struct ext4_extent dext_old, oext_old;
566 611
612 BUG_ON(orig_off != donor_off);
613
614 /* original and donor extents have to cover the same block offset */
615 if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
616 le32_to_cpu(tmp_oext->ee_block) +
617 ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
618 return -ENODATA;
619
620 if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
621 le32_to_cpu(tmp_dext->ee_block) +
622 ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
623 return -ENODATA;
624
567 dext_old = *tmp_dext; 625 dext_old = *tmp_dext;
568 oext_old = *tmp_oext; 626 oext_old = *tmp_oext;
569 627
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
591 649
592 copy_extent_status(&oext_old, tmp_dext); 650 copy_extent_status(&oext_old, tmp_dext);
593 copy_extent_status(&dext_old, tmp_oext); 651 copy_extent_status(&dext_old, tmp_oext);
652
653 return 0;
594} 654}
595 655
596/** 656/**
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
631 mext_double_down_write(orig_inode, donor_inode); 691 mext_double_down_write(orig_inode, donor_inode);
632 692
633 /* Get the original extent for the block "orig_off" */ 693 /* Get the original extent for the block "orig_off" */
634 get_ext_path(orig_path, orig_inode, orig_off, err); 694 err = get_ext_path(orig_inode, orig_off, &orig_path);
635 if (orig_path == NULL) 695 if (err)
636 goto out; 696 goto out;
637 697
638 /* Get the donor extent for the head */ 698 /* Get the donor extent for the head */
639 get_ext_path(donor_path, donor_inode, donor_off, err); 699 err = get_ext_path(donor_inode, donor_off, &donor_path);
640 if (donor_path == NULL) 700 if (err)
641 goto out; 701 goto out;
642 depth = ext_depth(orig_inode); 702 depth = ext_depth(orig_inode);
643 oext = orig_path[depth].p_ext; 703 oext = orig_path[depth].p_ext;
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
647 dext = donor_path[depth].p_ext; 707 dext = donor_path[depth].p_ext;
648 tmp_dext = *dext; 708 tmp_dext = *dext;
649 709
650 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 710 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
651 donor_off, count); 711 donor_off, count);
712 if (err)
713 goto out;
652 714
653 /* Loop for the donor extents */ 715 /* Loop for the donor extents */
654 while (1) { 716 while (1) {
655 /* The extent for donor must be found. */ 717 /* The extent for donor must be found. */
656 BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); 718 if (!dext) {
719 ext4_error(donor_inode->i_sb, __func__,
720 "The extent for donor must be found");
721 err = -EIO;
722 goto out;
723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
724 ext4_error(donor_inode->i_sb, __func__,
725 "Donor offset(%u) and the first block of donor "
726 "extent(%u) should be equal",
727 donor_off,
728 le32_to_cpu(tmp_dext.ee_block));
729 err = -EIO;
730 goto out;
731 }
657 732
658 /* Set donor extent to orig extent */ 733 /* Set donor extent to orig extent */
659 err = mext_leaf_block(handle, orig_inode, 734 err = mext_leaf_block(handle, orig_inode,
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
678 753
679 if (orig_path) 754 if (orig_path)
680 ext4_ext_drop_refs(orig_path); 755 ext4_ext_drop_refs(orig_path);
681 get_ext_path(orig_path, orig_inode, orig_off, err); 756 err = get_ext_path(orig_inode, orig_off, &orig_path);
682 if (orig_path == NULL) 757 if (err)
683 goto out; 758 goto out;
684 depth = ext_depth(orig_inode); 759 depth = ext_depth(orig_inode);
685 oext = orig_path[depth].p_ext; 760 oext = orig_path[depth].p_ext;
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 767
693 if (donor_path) 768 if (donor_path)
694 ext4_ext_drop_refs(donor_path); 769 ext4_ext_drop_refs(donor_path);
695 get_ext_path(donor_path, donor_inode, 770 err = get_ext_path(donor_inode, donor_off, &donor_path);
696 donor_off, err); 771 if (err)
697 if (donor_path == NULL)
698 goto out; 772 goto out;
699 depth = ext_depth(donor_inode); 773 depth = ext_depth(donor_inode);
700 dext = donor_path[depth].p_ext; 774 dext = donor_path[depth].p_ext;
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
705 } 779 }
706 tmp_dext = *dext; 780 tmp_dext = *dext;
707 781
708 mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 782 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
709 donor_off, 783 donor_off, count - replaced_count);
710 count - replaced_count); 784 if (err)
785 goto out;
711 } 786 }
712 787
713out: 788out:
@@ -740,7 +815,7 @@ out:
740 * on success, or a negative error value on failure. 815 * on success, or a negative error value on failure.
741 */ 816 */
742static int 817static int
743move_extent_par_page(struct file *o_filp, struct inode *donor_inode, 818move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
744 pgoff_t orig_page_offset, int data_offset_in_page, 819 pgoff_t orig_page_offset, int data_offset_in_page,
745 int block_len_in_page, int uninit) 820 int block_len_in_page, int uninit)
746{ 821{
@@ -871,6 +946,7 @@ out:
871 if (PageLocked(page)) 946 if (PageLocked(page))
872 unlock_page(page); 947 unlock_page(page);
873 page_cache_release(page); 948 page_cache_release(page);
949 ext4_journal_stop(handle);
874 } 950 }
875out2: 951out2:
876 ext4_journal_stop(handle); 952 ext4_journal_stop(handle);
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode,
897 struct inode *donor_inode, __u64 orig_start, 973 struct inode *donor_inode, __u64 orig_start,
898 __u64 donor_start, __u64 *len, __u64 moved_len) 974 __u64 donor_start, __u64 *len, __u64 moved_len)
899{ 975{
976 ext4_lblk_t orig_blocks, donor_blocks;
977 unsigned int blkbits = orig_inode->i_blkbits;
978 unsigned int blocksize = 1 << blkbits;
979
900 /* Regular file check */ 980 /* Regular file check */
901 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 981 if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
902 ext4_debug("ext4 move extent: The argument files should be " 982 ext4_debug("ext4 move extent: The argument files should be "
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode,
960 return -EINVAL; 1040 return -EINVAL;
961 } 1041 }
962 1042
963 if ((orig_start > MAX_DEFRAG_SIZE) || 1043 if ((orig_start > EXT_MAX_BLOCK) ||
964 (donor_start > MAX_DEFRAG_SIZE) || 1044 (donor_start > EXT_MAX_BLOCK) ||
965 (*len > MAX_DEFRAG_SIZE) || 1045 (*len > EXT_MAX_BLOCK) ||
966 (orig_start + *len > MAX_DEFRAG_SIZE)) { 1046 (orig_start + *len > EXT_MAX_BLOCK)) {
967 ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " 1047 ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
968 "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, 1048 "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK,
969 orig_inode->i_ino, donor_inode->i_ino); 1049 orig_inode->i_ino, donor_inode->i_ino);
970 return -EINVAL; 1050 return -EINVAL;
971 } 1051 }
972 1052
973 if (orig_inode->i_size > donor_inode->i_size) { 1053 if (orig_inode->i_size > donor_inode->i_size) {
974 if (orig_start >= donor_inode->i_size) { 1054 donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
1055 /* TODO: eliminate this artificial restriction */
1056 if (orig_start >= donor_blocks) {
975 ext4_debug("ext4 move extent: orig start offset " 1057 ext4_debug("ext4 move extent: orig start offset "
976 "[%llu] should be less than donor file size " 1058 "[%llu] should be less than donor file blocks "
977 "[%lld] [ino:orig %lu, donor_inode %lu]\n", 1059 "[%u] [ino:orig %lu, donor %lu]\n",
978 orig_start, donor_inode->i_size, 1060 orig_start, donor_blocks,
979 orig_inode->i_ino, donor_inode->i_ino); 1061 orig_inode->i_ino, donor_inode->i_ino);
980 return -EINVAL; 1062 return -EINVAL;
981 } 1063 }
982 1064
983 if (orig_start + *len > donor_inode->i_size) { 1065 /* TODO: eliminate this artificial restriction */
1066 if (orig_start + *len > donor_blocks) {
984 ext4_debug("ext4 move extent: End offset [%llu] should " 1067 ext4_debug("ext4 move extent: End offset [%llu] should "
985 "be less than donor file size [%lld]." 1068 "be less than donor file blocks [%u]."
986 "So adjust length from %llu to %lld " 1069 "So adjust length from %llu to %llu "
987 "[ino:orig %lu, donor %lu]\n", 1070 "[ino:orig %lu, donor %lu]\n",
988 orig_start + *len, donor_inode->i_size, 1071 orig_start + *len, donor_blocks,
989 *len, donor_inode->i_size - orig_start, 1072 *len, donor_blocks - orig_start,
990 orig_inode->i_ino, donor_inode->i_ino); 1073 orig_inode->i_ino, donor_inode->i_ino);
991 *len = donor_inode->i_size - orig_start; 1074 *len = donor_blocks - orig_start;
992 } 1075 }
993 } else { 1076 } else {
994 if (orig_start >= orig_inode->i_size) { 1077 orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
1078 if (orig_start >= orig_blocks) {
995 ext4_debug("ext4 move extent: start offset [%llu] " 1079 ext4_debug("ext4 move extent: start offset [%llu] "
996 "should be less than original file size " 1080 "should be less than original file blocks "
997 "[%lld] [inode:orig %lu, donor %lu]\n", 1081 "[%u] [ino:orig %lu, donor %lu]\n",
998 orig_start, orig_inode->i_size, 1082 orig_start, orig_blocks,
999 orig_inode->i_ino, donor_inode->i_ino); 1083 orig_inode->i_ino, donor_inode->i_ino);
1000 return -EINVAL; 1084 return -EINVAL;
1001 } 1085 }
1002 1086
1003 if (orig_start + *len > orig_inode->i_size) { 1087 if (orig_start + *len > orig_blocks) {
1004 ext4_debug("ext4 move extent: Adjust length " 1088 ext4_debug("ext4 move extent: Adjust length "
1005 "from %llu to %lld. Because it should be " 1089 "from %llu to %llu. Because it should be "
1006 "less than original file size " 1090 "less than original file blocks "
1007 "[ino:orig %lu, donor %lu]\n", 1091 "[ino:orig %lu, donor %lu]\n",
1008 *len, orig_inode->i_size - orig_start, 1092 *len, orig_blocks - orig_start,
1009 orig_inode->i_ino, donor_inode->i_ino); 1093 orig_inode->i_ino, donor_inode->i_ino);
1010 *len = orig_inode->i_size - orig_start; 1094 *len = orig_blocks - orig_start;
1011 } 1095 }
1012 } 1096 }
1013 1097
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode,
1027 * @inode1: the inode structure 1111 * @inode1: the inode structure
1028 * @inode2: the inode structure 1112 * @inode2: the inode structure
1029 * 1113 *
1030 * Lock two inodes' i_mutex by i_ino order. This function is moved from 1114 * Lock two inodes' i_mutex by i_ino order.
1031 * fs/inode.c. 1115 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1032 */ 1116 */
1033static void 1117static int
1034mext_inode_double_lock(struct inode *inode1, struct inode *inode2) 1118mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1035{ 1119{
1036 if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { 1120 int ret = 0;
1037 if (inode1) 1121
1038 mutex_lock(&inode1->i_mutex); 1122 BUG_ON(inode1 == NULL && inode2 == NULL);
1039 else if (inode2) 1123
1040 mutex_lock(&inode2->i_mutex); 1124 ret = mext_check_null_inode(inode1, inode2, __func__);
1041 return; 1125 if (ret < 0)
1126 goto out;
1127
1128 if (inode1 == inode2) {
1129 mutex_lock(&inode1->i_mutex);
1130 goto out;
1042 } 1131 }
1043 1132
1044 if (inode1->i_ino < inode2->i_ino) { 1133 if (inode1->i_ino < inode2->i_ino) {
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1048 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); 1137 mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
1049 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); 1138 mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
1050 } 1139 }
1140
1141out:
1142 return ret;
1051} 1143}
1052 1144
1053/** 1145/**
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
1056 * @inode1: the inode that is released first 1148 * @inode1: the inode that is released first
1057 * @inode2: the inode that is released second 1149 * @inode2: the inode that is released second
1058 * 1150 *
1059 * This function is moved from fs/inode.c. 1151 * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0.
1060 */ 1152 */
1061 1153
1062static void 1154static int
1063mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) 1155mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
1064{ 1156{
1157 int ret = 0;
1158
1159 BUG_ON(inode1 == NULL && inode2 == NULL);
1160
1161 ret = mext_check_null_inode(inode1, inode2, __func__);
1162 if (ret < 0)
1163 goto out;
1164
1065 if (inode1) 1165 if (inode1)
1066 mutex_unlock(&inode1->i_mutex); 1166 mutex_unlock(&inode1->i_mutex);
1067 1167
1068 if (inode2 && inode2 != inode1) 1168 if (inode2 && inode2 != inode1)
1069 mutex_unlock(&inode2->i_mutex); 1169 mutex_unlock(&inode2->i_mutex);
1170
1171out:
1172 return ret;
1070} 1173}
1071 1174
1072/** 1175/**
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1123 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; 1226 ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
1124 ext4_lblk_t rest_blocks; 1227 ext4_lblk_t rest_blocks;
1125 pgoff_t orig_page_offset = 0, seq_end_page; 1228 pgoff_t orig_page_offset = 0, seq_end_page;
1126 int ret, depth, last_extent = 0; 1229 int ret1, ret2, depth, last_extent = 0;
1127 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 1230 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
1128 int data_offset_in_page; 1231 int data_offset_in_page;
1129 int block_len_in_page; 1232 int block_len_in_page;
1130 int uninit; 1233 int uninit;
1131 1234
1132 /* protect orig and donor against a truncate */ 1235 /* protect orig and donor against a truncate */
1133 mext_inode_double_lock(orig_inode, donor_inode); 1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0)
1238 return ret1;
1134 1239
1135 mext_double_down_read(orig_inode, donor_inode); 1240 mext_double_down_read(orig_inode, donor_inode);
1136 /* Check the filesystem environment whether move_extent can be done */ 1241 /* Check the filesystem environment whether move_extent can be done */
1137 ret = mext_check_arguments(orig_inode, donor_inode, orig_start, 1242 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1138 donor_start, &len, *moved_len); 1243 donor_start, &len, *moved_len);
1139 mext_double_up_read(orig_inode, donor_inode); 1244 mext_double_up_read(orig_inode, donor_inode);
1140 if (ret) 1245 if (ret1)
1141 goto out2; 1246 goto out;
1142 1247
1143 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; 1248 file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
1144 block_end = block_start + len - 1; 1249 block_end = block_start + len - 1;
1145 if (file_end < block_end) 1250 if (file_end < block_end)
1146 len -= block_end - file_end; 1251 len -= block_end - file_end;
1147 1252
1148 get_ext_path(orig_path, orig_inode, block_start, ret); 1253 ret1 = get_ext_path(orig_inode, block_start, &orig_path);
1149 if (orig_path == NULL) 1254 if (ret1)
1150 goto out2; 1255 goto out;
1151 1256
1152 /* Get path structure to check the hole */ 1257 /* Get path structure to check the hole */
1153 get_ext_path(holecheck_path, orig_inode, block_start, ret); 1258 ret1 = get_ext_path(orig_inode, block_start, &holecheck_path);
1154 if (holecheck_path == NULL) 1259 if (ret1)
1155 goto out; 1260 goto out;
1156 1261
1157 depth = ext_depth(orig_inode); 1262 depth = ext_depth(orig_inode);
1158 ext_cur = holecheck_path[depth].p_ext; 1263 ext_cur = holecheck_path[depth].p_ext;
1159 if (ext_cur == NULL) {
1160 ret = -EINVAL;
1161 goto out;
1162 }
1163 1264
1164 /* 1265 /*
1165 * Get proper extent whose ee_block is beyond block_start 1266 * Get proper starting location of block replacement if block_start was
1166 * if block_start was within the hole. 1267 * within the hole.
1167 */ 1268 */
1168 if (le32_to_cpu(ext_cur->ee_block) + 1269 if (le32_to_cpu(ext_cur->ee_block) +
1169 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { 1270 ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
1271 /*
1272 * The hole exists between extents or the tail of
1273 * original file.
1274 */
1170 last_extent = mext_next_extent(orig_inode, 1275 last_extent = mext_next_extent(orig_inode,
1171 holecheck_path, &ext_cur); 1276 holecheck_path, &ext_cur);
1172 if (last_extent < 0) { 1277 if (last_extent < 0) {
1173 ret = last_extent; 1278 ret1 = last_extent;
1174 goto out; 1279 goto out;
1175 } 1280 }
1176 last_extent = mext_next_extent(orig_inode, orig_path, 1281 last_extent = mext_next_extent(orig_inode, orig_path,
1177 &ext_dummy); 1282 &ext_dummy);
1178 if (last_extent < 0) { 1283 if (last_extent < 0) {
1179 ret = last_extent; 1284 ret1 = last_extent;
1180 goto out; 1285 goto out;
1181 } 1286 }
1182 } 1287 seq_start = le32_to_cpu(ext_cur->ee_block);
1183 seq_start = block_start; 1288 } else if (le32_to_cpu(ext_cur->ee_block) > block_start)
1289 /* The hole exists at the beginning of original file. */
1290 seq_start = le32_to_cpu(ext_cur->ee_block);
1291 else
1292 seq_start = block_start;
1184 1293
1185 /* No blocks within the specified range. */ 1294 /* No blocks within the specified range. */
1186 if (le32_to_cpu(ext_cur->ee_block) > block_end) { 1295 if (le32_to_cpu(ext_cur->ee_block) > block_end) {
1187 ext4_debug("ext4 move extent: The specified range of file " 1296 ext4_debug("ext4 move extent: The specified range of file "
1188 "may be the hole\n"); 1297 "may be the hole\n");
1189 ret = -EINVAL; 1298 ret1 = -EINVAL;
1190 goto out; 1299 goto out;
1191 } 1300 }
1192 1301
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1206 last_extent = mext_next_extent(orig_inode, holecheck_path, 1315 last_extent = mext_next_extent(orig_inode, holecheck_path,
1207 &ext_cur); 1316 &ext_cur);
1208 if (last_extent < 0) { 1317 if (last_extent < 0) {
1209 ret = last_extent; 1318 ret1 = last_extent;
1210 break; 1319 break;
1211 } 1320 }
1212 add_blocks = ext4_ext_get_actual_len(ext_cur); 1321 add_blocks = ext4_ext_get_actual_len(ext_cur);
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1258 while (orig_page_offset <= seq_end_page) { 1367 while (orig_page_offset <= seq_end_page) {
1259 1368
1260 /* Swap original branches with new branches */ 1369 /* Swap original branches with new branches */
1261 ret = move_extent_par_page(o_filp, donor_inode, 1370 ret1 = move_extent_per_page(o_filp, donor_inode,
1262 orig_page_offset, 1371 orig_page_offset,
1263 data_offset_in_page, 1372 data_offset_in_page,
1264 block_len_in_page, uninit); 1373 block_len_in_page, uninit);
1265 if (ret < 0) 1374 if (ret1 < 0)
1266 goto out; 1375 goto out;
1267 orig_page_offset++; 1376 orig_page_offset++;
1268 /* Count how many blocks we have exchanged */ 1377 /* Count how many blocks we have exchanged */
1269 *moved_len += block_len_in_page; 1378 *moved_len += block_len_in_page;
1270 BUG_ON(*moved_len > len); 1379 if (*moved_len > len) {
1380 ext4_error(orig_inode->i_sb, __func__,
1381 "We replaced blocks too much! "
1382 "sum of replaced: %llu requested: %llu",
1383 *moved_len, len);
1384 ret1 = -EIO;
1385 goto out;
1386 }
1271 1387
1272 data_offset_in_page = 0; 1388 data_offset_in_page = 0;
1273 rest_blocks -= block_len_in_page; 1389 rest_blocks -= block_len_in_page;
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1280 /* Decrease buffer counter */ 1396 /* Decrease buffer counter */
1281 if (holecheck_path) 1397 if (holecheck_path)
1282 ext4_ext_drop_refs(holecheck_path); 1398 ext4_ext_drop_refs(holecheck_path);
1283 get_ext_path(holecheck_path, orig_inode, 1399 ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path);
1284 seq_start, ret); 1400 if (ret1)
1285 if (holecheck_path == NULL)
1286 break; 1401 break;
1287 depth = holecheck_path->p_depth; 1402 depth = holecheck_path->p_depth;
1288 1403
1289 /* Decrease buffer counter */ 1404 /* Decrease buffer counter */
1290 if (orig_path) 1405 if (orig_path)
1291 ext4_ext_drop_refs(orig_path); 1406 ext4_ext_drop_refs(orig_path);
1292 get_ext_path(orig_path, orig_inode, seq_start, ret); 1407 ret1 = get_ext_path(orig_inode, seq_start, &orig_path);
1293 if (orig_path == NULL) 1408 if (ret1)
1294 break; 1409 break;
1295 1410
1296 ext_cur = holecheck_path[depth].p_ext; 1411 ext_cur = holecheck_path[depth].p_ext;
@@ -1307,14 +1422,13 @@ out:
1307 ext4_ext_drop_refs(holecheck_path); 1422 ext4_ext_drop_refs(holecheck_path);
1308 kfree(holecheck_path); 1423 kfree(holecheck_path);
1309 } 1424 }
1310out2:
1311 mext_inode_double_unlock(orig_inode, donor_inode);
1312 1425
1313 if (ret) 1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1314 return ret;
1315 1427
1316 /* All of the specified blocks must be exchanged in succeed */ 1428 if (ret1)
1317 BUG_ON(*moved_len != len); 1429 return ret1;
1430 else if (ret2)
1431 return ret2;
1318 1432
1319 return 0; 1433 return 0;
1320} 1434}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index de04013d16ff..42f81d285cd5 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1518 return retval; 1518 return retval;
1519 1519
1520 if (blocks == 1 && !dx_fallback && 1520 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
1522 return make_indexed_dir(handle, dentry, inode, bh); 1522 retval = make_indexed_dir(handle, dentry, inode, bh);
1523 if (retval == -ENOSPC)
1524 brelse(bh);
1525 return retval;
1526 }
1523 brelse(bh); 1527 brelse(bh);
1524 } 1528 }
1525 bh = ext4_append(handle, dir, &block, &retval); 1529 bh = ext4_append(handle, dir, &block, &retval);
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1532 de = (struct ext4_dir_entry_2 *) bh->b_data;
1529 de->inode = 0; 1533 de->inode = 0;
1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1534 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1531 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1535 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1536 if (retval == -ENOSPC)
1537 brelse(bh);
1538 return retval;
1532} 1539}
1533 1540
1534/* 1541/*
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1590 goto cleanup; 1597 goto cleanup;
1591 node2 = (struct dx_node *)(bh2->b_data); 1598 node2 = (struct dx_node *)(bh2->b_data);
1592 entries2 = node2->entries; 1599 entries2 = node2->entries;
1600 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1593 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, 1601 node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
1594 sb->s_blocksize); 1602 sb->s_blocksize);
1595 node2->fake.inode = 0;
1596 BUFFER_TRACE(frame->bh, "get_write_access"); 1603 BUFFER_TRACE(frame->bh, "get_write_access");
1597 err = ext4_journal_get_write_access(handle, frame->bh); 1604 err = ext4_journal_get_write_access(handle, frame->bh);
1598 if (err) 1605 if (err)
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1657 if (!de) 1664 if (!de)
1658 goto cleanup; 1665 goto cleanup;
1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1666 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1660 bh = NULL; 1667 if (err != -ENOSPC)
1668 bh = NULL;
1661 goto cleanup; 1669 goto cleanup;
1662 1670
1663journal_error: 1671journal_error:
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry,
2310 struct inode *inode = old_dentry->d_inode; 2318 struct inode *inode = old_dentry->d_inode;
2311 int err, retries = 0; 2319 int err, retries = 0;
2312 2320
2313 if (EXT4_DIR_LINK_MAX(inode)) 2321 if (inode->i_nlink >= EXT4_LINK_MAX)
2314 return -EMLINK; 2322 return -EMLINK;
2315 2323
2316 /* 2324 /*
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2413 goto end_rename; 2421 goto end_rename;
2414 retval = -EMLINK; 2422 retval = -EMLINK;
2415 if (!new_inode && new_dir != old_dir && 2423 if (!new_inode && new_dir != old_dir &&
2416 new_dir->i_nlink >= EXT4_LINK_MAX) 2424 EXT4_DIR_LINK_MAX(new_dir))
2417 goto end_rename; 2425 goto end_rename;
2418 } 2426 }
2419 if (!new_bh) { 2427 if (!new_bh) {
@@ -2536,7 +2544,7 @@ const struct inode_operations ext4_dir_inode_operations = {
2536 .listxattr = ext4_listxattr, 2544 .listxattr = ext4_listxattr,
2537 .removexattr = generic_removexattr, 2545 .removexattr = generic_removexattr,
2538#endif 2546#endif
2539 .permission = ext4_permission, 2547 .check_acl = ext4_check_acl,
2540 .fiemap = ext4_fiemap, 2548 .fiemap = ext4_fiemap,
2541}; 2549};
2542 2550
@@ -2548,5 +2556,5 @@ const struct inode_operations ext4_special_inode_operations = {
2548 .listxattr = ext4_listxattr, 2556 .listxattr = ext4_listxattr,
2549 .removexattr = generic_removexattr, 2557 .removexattr = generic_removexattr,
2550#endif 2558#endif
2551 .permission = ext4_permission, 2559 .check_acl = ext4_check_acl,
2552}; 2560};
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 68b0351fc647..3cfc343c41b5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
746 struct inode *inode = NULL; 746 struct inode *inode = NULL;
747 handle_t *handle; 747 handle_t *handle;
748 int gdb_off, gdb_num; 748 int gdb_off, gdb_num;
749 int num_grp_locked = 0;
750 int err, err2; 749 int err, err2;
751 750
752 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 751 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
856 * using the new disk blocks. 855 * using the new disk blocks.
857 */ 856 */
858 857
859 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group);
860 /* Update group descriptor block for new group */ 858 /* Update group descriptor block for new group */
861 gdp = (struct ext4_group_desc *)((char *)primary->b_data + 859 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
862 gdb_off * EXT4_DESC_SIZE(sb)); 860 gdb_off * EXT4_DESC_SIZE(sb));
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
875 * descriptor 873 * descriptor
876 */ 874 */
877 err = ext4_mb_add_groupinfo(sb, input->group, gdp); 875 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
878 if (err) { 876 if (err)
879 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
880 goto exit_journal; 877 goto exit_journal;
881 }
882 878
883 /* 879 /*
884 * Make the new blocks and inodes valid next. We do this before 880 * Make the new blocks and inodes valid next. We do this before
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
920 916
921 /* Update the global fs size fields */ 917 /* Update the global fs size fields */
922 sbi->s_groups_count++; 918 sbi->s_groups_count++;
923 ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked);
924 919
925 ext4_handle_dirty_metadata(handle, NULL, primary); 920 ext4_handle_dirty_metadata(handle, NULL, primary);
926 921
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8f4f079e6b9a..a6b1ab734728 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,6 +45,7 @@
45#include "ext4_jbd2.h" 45#include "ext4_jbd2.h"
46#include "xattr.h" 46#include "xattr.h"
47#include "acl.h" 47#include "acl.h"
48#include "mballoc.h"
48 49
49#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
50#include <trace/events/ext4.h> 51#include <trace/events/ext4.h>
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno,
344 errstr = "Out of memory"; 345 errstr = "Out of memory";
345 break; 346 break;
346 case -EROFS: 347 case -EROFS:
347 if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) 348 if (!sb || (EXT4_SB(sb)->s_journal &&
349 EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT))
348 errstr = "Journal has aborted"; 350 errstr = "Journal has aborted";
349 else 351 else
350 errstr = "Readonly filesystem"; 352 errstr = "Readonly filesystem";
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb,
1279 *journal_devnum = option; 1281 *journal_devnum = option;
1280 break; 1282 break;
1281 case Opt_journal_checksum: 1283 case Opt_journal_checksum:
1282 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); 1284 break; /* Kept for backwards compatibility */
1283 break;
1284 case Opt_journal_async_commit: 1285 case Opt_journal_async_commit:
1285 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); 1286 set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT);
1286 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
1287 break; 1287 break;
1288 case Opt_noload: 1288 case Opt_noload:
1289 set_opt(sbi->s_mount_opt, NOLOAD); 1289 set_opt(sbi->s_mount_opt, NOLOAD);
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
1695 gdp = ext4_get_group_desc(sb, i, NULL); 1695 gdp = ext4_get_group_desc(sb, i, NULL);
1696 1696
1697 flex_group = ext4_flex_group(sbi, i); 1697 flex_group = ext4_flex_group(sbi, i);
1698 atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, 1698 atomic_add(ext4_free_inodes_count(sb, gdp),
1699 ext4_free_inodes_count(sb, gdp)); 1699 &sbi->s_flex_groups[flex_group].free_inodes);
1700 atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, 1700 atomic_add(ext4_free_blks_count(sb, gdp),
1701 ext4_free_blks_count(sb, gdp)); 1701 &sbi->s_flex_groups[flex_group].free_blocks);
1702 atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, 1702 atomic_add(ext4_used_dirs_count(sb, gdp),
1703 ext4_used_dirs_count(sb, gdp)); 1703 &sbi->s_flex_groups[flex_group].used_dirs);
1704 } 1704 }
1705 1705
1706 return 1; 1706 return 1;
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = {
2253 .release = ext4_sb_release, 2253 .release = ext4_sb_release,
2254}; 2254};
2255 2255
2256/*
2257 * Check whether this filesystem can be mounted based on
2258 * the features present and the RDONLY/RDWR mount requested.
2259 * Returns 1 if this filesystem can be mounted as requested,
2260 * 0 if it cannot be.
2261 */
2262static int ext4_feature_set_ok(struct super_block *sb, int readonly)
2263{
2264 if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
2265 ext4_msg(sb, KERN_ERR,
2266 "Couldn't mount because of "
2267 "unsupported optional features (%x)",
2268 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2269 ~EXT4_FEATURE_INCOMPAT_SUPP));
2270 return 0;
2271 }
2272
2273 if (readonly)
2274 return 1;
2275
2276 /* Check that feature set is OK for a read-write mount */
2277 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
2278 ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
2279 "unsupported optional features (%x)",
2280 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2281 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2282 return 0;
2283 }
2284 /*
2285 * Large file size enabled file system can only be mounted
2286 * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
2287 */
2288 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
2289 if (sizeof(blkcnt_t) < sizeof(u64)) {
2290 ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
2291 "cannot be mounted RDWR without "
2292 "CONFIG_LBDAF");
2293 return 0;
2294 }
2295 }
2296 return 1;
2297}
2298
2256static int ext4_fill_super(struct super_block *sb, void *data, int silent) 2299static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2257 __releases(kernel_lock) 2300 __releases(kernel_lock)
2258 __acquires(kernel_lock) 2301 __acquires(kernel_lock)
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2274 unsigned int db_count; 2317 unsigned int db_count;
2275 unsigned int i; 2318 unsigned int i;
2276 int needs_recovery, has_huge_files; 2319 int needs_recovery, has_huge_files;
2277 int features;
2278 __u64 blocks_count; 2320 __u64 blocks_count;
2279 int err; 2321 int err;
2280 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 2322 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2401 * previously didn't change the revision level when setting the flags, 2443 * previously didn't change the revision level when setting the flags,
2402 * so there is a chance incompat flags are set on a rev 0 filesystem. 2444 * so there is a chance incompat flags are set on a rev 0 filesystem.
2403 */ 2445 */
2404 features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); 2446 if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY)))
2405 if (features) {
2406 ext4_msg(sb, KERN_ERR,
2407 "Couldn't mount because of "
2408 "unsupported optional features (%x)",
2409 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
2410 ~EXT4_FEATURE_INCOMPAT_SUPP));
2411 goto failed_mount;
2412 }
2413 features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
2414 if (!(sb->s_flags & MS_RDONLY) && features) {
2415 ext4_msg(sb, KERN_ERR,
2416 "Couldn't mount RDWR because of "
2417 "unsupported optional features (%x)",
2418 (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
2419 ~EXT4_FEATURE_RO_COMPAT_SUPP));
2420 goto failed_mount; 2447 goto failed_mount;
2421 } 2448
2422 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2423 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2424 if (has_huge_files) {
2425 /*
2426 * Large file size enabled file system can only be
2427 * mount if kernel is build with CONFIG_LBDAF
2428 */
2429 if (sizeof(root->i_blocks) < sizeof(u64) &&
2430 !(sb->s_flags & MS_RDONLY)) {
2431 ext4_msg(sb, KERN_ERR, "Filesystem with huge "
2432 "files cannot be mounted read-write "
2433 "without CONFIG_LBDAF");
2434 goto failed_mount;
2435 }
2436 }
2437 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); 2449 blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
2438 2450
2439 if (blocksize < EXT4_MIN_BLOCK_SIZE || 2451 if (blocksize < EXT4_MIN_BLOCK_SIZE ||
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2469 } 2481 }
2470 } 2482 }
2471 2483
2484 has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
2485 EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
2472 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, 2486 sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
2473 has_huge_files); 2487 has_huge_files);
2474 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); 2488 sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2549 goto failed_mount; 2563 goto failed_mount;
2550 } 2564 }
2551 2565
2552 if (ext4_blocks_count(es) > 2566 /*
2553 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { 2567 * Test whether we have more sectors than will fit in sector_t,
2568 * and whether the max offset is addressable by the page cache.
2569 */
2570 if ((ext4_blocks_count(es) >
2571 (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) ||
2572 (ext4_blocks_count(es) >
2573 (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) {
2554 ext4_msg(sb, KERN_ERR, "filesystem" 2574 ext4_msg(sb, KERN_ERR, "filesystem"
2555 " too large to mount safely"); 2575 " too large to mount safely on this system");
2556 if (sizeof(sector_t) < 8) 2576 if (sizeof(sector_t) < 8)
2557 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); 2577 ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
2578 ret = -EFBIG;
2558 goto failed_mount; 2579 goto failed_mount;
2559 } 2580 }
2560 2581
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2595 goto failed_mount; 2616 goto failed_mount;
2596 } 2617 }
2597 sbi->s_groups_count = blocks_count; 2618 sbi->s_groups_count = blocks_count;
2619 sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
2620 (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
2598 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / 2621 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
2599 EXT4_DESC_PER_BLOCK(sb); 2622 EXT4_DESC_PER_BLOCK(sb);
2600 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), 2623 sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2729 goto failed_mount4; 2752 goto failed_mount4;
2730 } 2753 }
2731 2754
2732 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 2755 jbd2_journal_set_features(sbi->s_journal,
2733 jbd2_journal_set_features(sbi->s_journal, 2756 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2734 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 2757 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
2758 jbd2_journal_set_features(sbi->s_journal, 0, 0,
2735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2759 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2736 } else if (test_opt(sb, JOURNAL_CHECKSUM)) { 2760 else
2737 jbd2_journal_set_features(sbi->s_journal,
2738 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
2739 jbd2_journal_clear_features(sbi->s_journal, 0, 0, 2761 jbd2_journal_clear_features(sbi->s_journal, 0, 0,
2740 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); 2762 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2741 } else {
2742 jbd2_journal_clear_features(sbi->s_journal,
2743 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2744 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2745 }
2746 2763
2747 /* We have now updated the journal if required, so we can 2764 /* We have now updated the journal if required, so we can
2748 * validate the data journaling mode. */ 2765 * validate the data journaling mode. */
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3208 clear_buffer_write_io_error(sbh); 3225 clear_buffer_write_io_error(sbh);
3209 set_buffer_uptodate(sbh); 3226 set_buffer_uptodate(sbh);
3210 } 3227 }
3211 es->s_wtime = cpu_to_le32(get_seconds()); 3228 /*
3229 * If the file system is mounted read-only, don't update the
3230 * superblock write time. This avoids updating the superblock
3231 * write time when we are mounting the root file system
3232 * read/only but we need to replay the journal; at that point,
3233 * for people who are east of GMT and who make their clock
3234 * tick in localtime for Windows bug-for-bug compatibility,
3235 * the clock is set in the future, and this will cause e2fsck
3236 * to complain and force a full file system check.
3237 */
3238 if (!(sb->s_flags & MS_RDONLY))
3239 es->s_wtime = cpu_to_le32(get_seconds());
3212 es->s_kbytes_written = 3240 es->s_kbytes_written =
3213 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3241 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3214 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3242 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3477 if (sbi->s_journal) 3505 if (sbi->s_journal)
3478 ext4_mark_recovery_complete(sb, es); 3506 ext4_mark_recovery_complete(sb, es);
3479 } else { 3507 } else {
3480 int ret; 3508 /* Make sure we can mount this feature set readwrite */
3481 if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, 3509 if (!ext4_feature_set_ok(sb, 0)) {
3482 ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
3483 ext4_msg(sb, KERN_WARNING, "couldn't "
3484 "remount RDWR because of unsupported "
3485 "optional features (%x)",
3486 (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
3487 ~EXT4_FEATURE_RO_COMPAT_SUPP));
3488 err = -EROFS; 3510 err = -EROFS;
3489 goto restore_opts; 3511 goto restore_opts;
3490 } 3512 }
3491
3492 /* 3513 /*
3493 * Make sure the group descriptor checksums 3514 * Make sure the group descriptor checksums
3494 * are sane. If they aren't, refuse to remount r/w. 3515 * are sane. If they aren't, refuse to remount r/w.
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 62b31c246994..fed5b01d7a8d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,12 +810,23 @@ inserted:
810 get_bh(new_bh); 810 get_bh(new_bh);
811 } else { 811 } else {
812 /* We need to allocate a new block */ 812 /* We need to allocate a new block */
813 ext4_fsblk_t goal = ext4_group_first_block_no(sb, 813 ext4_fsblk_t goal, block;
814
815 goal = ext4_group_first_block_no(sb,
814 EXT4_I(inode)->i_block_group); 816 EXT4_I(inode)->i_block_group);
815 ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, 817
818 /* non-extent files can't have physical blocks past 2^32 */
819 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
820 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
821
822 block = ext4_new_meta_blocks(handle, inode,
816 goal, NULL, &error); 823 goal, NULL, &error);
817 if (error) 824 if (error)
818 goto cleanup; 825 goto cleanup;
826
827 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
828 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
829
819 ea_idebug(inode, "creating block %d", block); 830 ea_idebug(inode, "creating block %d", block);
820 831
821 new_bh = sb_getblk(sb, block); 832 new_bh = sb_getblk(sb, block);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index f042b965c95c..e8c159de236b 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size)
176 176
177 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; 177 inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
178 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
179 if (IS_SYNC(inode)) 179 if (IS_SYNC(inode)) {
180 err = sync_page_range_nolock(inode, mapping, start, count); 180 int err2;
181
182 /*
183 * Opencode syncing since we don't have a file open to use
184 * standard fsync path.
185 */
186 err = filemap_fdatawrite_range(mapping, start,
187 start + count - 1);
188 err2 = sync_mapping_buffers(mapping);
189 if (!err)
190 err = err2;
191 err2 = write_inode_now(inode, 1);
192 if (!err)
193 err = err2;
194 if (!err) {
195 err = filemap_fdatawait_range(mapping, start,
196 start + count - 1);
197 }
198 }
181out: 199out:
182 return err; 200 return err;
183} 201}
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index a6c20473dfd7..4e35be873e09 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
119 MSDOS_I(inode)->i_start = new_dclus; 119 MSDOS_I(inode)->i_start = new_dclus;
120 MSDOS_I(inode)->i_logstart = new_dclus; 120 MSDOS_I(inode)->i_logstart = new_dclus;
121 /* 121 /*
122 * Since generic_osync_inode() synchronize later if 122 * Since generic_write_sync() synchronizes regular files later,
123 * this is not directory, we don't here. 123 * we sync here only directories.
124 */ 124 */
125 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { 125 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) {
126 ret = fat_sync_inode(inode); 126 ret = fat_sync_inode(inode);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c54226be5294..8e1e5e19d21e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -19,171 +19,245 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/kthread.h>
23#include <linux/freezer.h>
22#include <linux/writeback.h> 24#include <linux/writeback.h>
23#include <linux/blkdev.h> 25#include <linux/blkdev.h>
24#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
25#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
26#include "internal.h" 28#include "internal.h"
27 29
30#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)
28 31
29/** 32/*
30 * writeback_acquire - attempt to get exclusive writeback access to a device 33 * We don't actually have pdflush, but this one is exported through /proc...
31 * @bdi: the device's backing_dev_info structure 34 */
32 * 35int nr_pdflush_threads;
33 * It is a waste of resources to have more than one pdflush thread blocked on 36
34 * a single request queue. Exclusion at the request_queue level is obtained 37/*
35 * via a flag in the request_queue's backing_dev_info.state. 38 * Passed into wb_writeback(), essentially a subset of writeback_control
36 * 39 */
37 * Non-request_queue-backed address_spaces will share default_backing_dev_info, 40struct wb_writeback_args {
38 * unless they implement their own. Which is somewhat inefficient, as this 41 long nr_pages;
39 * may prevent concurrent writeback against multiple devices. 42 struct super_block *sb;
43 enum writeback_sync_modes sync_mode;
44 int for_kupdate;
45 int range_cyclic;
46};
47
48/*
49 * Work items for the bdi_writeback threads
40 */ 50 */
41static int writeback_acquire(struct backing_dev_info *bdi) 51struct bdi_work {
52 struct list_head list; /* pending work list */
53 struct rcu_head rcu_head; /* for RCU free/clear of work */
54
55 unsigned long seen; /* threads that have seen this work */
56 atomic_t pending; /* number of threads still to do work */
57
58 struct wb_writeback_args args; /* writeback arguments */
59
60 unsigned long state; /* flag bits, see WS_* */
61};
62
63enum {
64 WS_USED_B = 0,
65 WS_ONSTACK_B,
66};
67
68#define WS_USED (1 << WS_USED_B)
69#define WS_ONSTACK (1 << WS_ONSTACK_B)
70
71static inline bool bdi_work_on_stack(struct bdi_work *work)
72{
73 return test_bit(WS_ONSTACK_B, &work->state);
74}
75
76static inline void bdi_work_init(struct bdi_work *work,
77 struct wb_writeback_args *args)
42{ 78{
43 return !test_and_set_bit(BDI_pdflush, &bdi->state); 79 INIT_RCU_HEAD(&work->rcu_head);
80 work->args = *args;
81 work->state = WS_USED;
44} 82}
45 83
46/** 84/**
47 * writeback_in_progress - determine whether there is writeback in progress 85 * writeback_in_progress - determine whether there is writeback in progress
48 * @bdi: the device's backing_dev_info structure. 86 * @bdi: the device's backing_dev_info structure.
49 * 87 *
50 * Determine whether there is writeback in progress against a backing device. 88 * Determine whether there is writeback waiting to be handled against a
89 * backing device.
51 */ 90 */
52int writeback_in_progress(struct backing_dev_info *bdi) 91int writeback_in_progress(struct backing_dev_info *bdi)
53{ 92{
54 return test_bit(BDI_pdflush, &bdi->state); 93 return !list_empty(&bdi->work_list);
55} 94}
56 95
57/** 96static void bdi_work_clear(struct bdi_work *work)
58 * writeback_release - relinquish exclusive writeback access against a device.
59 * @bdi: the device's backing_dev_info structure
60 */
61static void writeback_release(struct backing_dev_info *bdi)
62{ 97{
63 BUG_ON(!writeback_in_progress(bdi)); 98 clear_bit(WS_USED_B, &work->state);
64 clear_bit(BDI_pdflush, &bdi->state); 99 smp_mb__after_clear_bit();
100 /*
101 * work can have disappeared at this point. bit waitq functions
102 * should be able to tolerate this, provided bdi_sched_wait does
103 * not dereference its pointer argument.
104 */
105 wake_up_bit(&work->state, WS_USED_B);
65} 106}
66 107
67static noinline void block_dump___mark_inode_dirty(struct inode *inode) 108static void bdi_work_free(struct rcu_head *head)
68{ 109{
69 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { 110 struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
70 struct dentry *dentry;
71 const char *name = "?";
72 111
73 dentry = d_find_alias(inode); 112 if (!bdi_work_on_stack(work))
74 if (dentry) { 113 kfree(work);
75 spin_lock(&dentry->d_lock); 114 else
76 name = (const char *) dentry->d_name.name; 115 bdi_work_clear(work);
77 }
78 printk(KERN_DEBUG
79 "%s(%d): dirtied inode %lu (%s) on %s\n",
80 current->comm, task_pid_nr(current), inode->i_ino,
81 name, inode->i_sb->s_id);
82 if (dentry) {
83 spin_unlock(&dentry->d_lock);
84 dput(dentry);
85 }
86 }
87} 116}
88 117
89/** 118static void wb_work_complete(struct bdi_work *work)
90 * __mark_inode_dirty - internal function
91 * @inode: inode to mark
92 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
93 * Mark an inode as dirty. Callers should use mark_inode_dirty or
94 * mark_inode_dirty_sync.
95 *
96 * Put the inode on the super block's dirty list.
97 *
98 * CAREFUL! We mark it dirty unconditionally, but move it onto the
99 * dirty list only if it is hashed or if it refers to a blockdev.
100 * If it was not hashed, it will never be added to the dirty list
101 * even if it is later hashed, as it will have been marked dirty already.
102 *
103 * In short, make sure you hash any inodes _before_ you start marking
104 * them dirty.
105 *
106 * This function *must* be atomic for the I_DIRTY_PAGES case -
107 * set_page_dirty() is called under spinlock in several places.
108 *
109 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
110 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
111 * the kernel-internal blockdev inode represents the dirtying time of the
112 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
113 * page->mapping->host, so the page-dirtying time is recorded in the internal
114 * blockdev inode.
115 */
116void __mark_inode_dirty(struct inode *inode, int flags)
117{ 119{
118 struct super_block *sb = inode->i_sb; 120 const enum writeback_sync_modes sync_mode = work->args.sync_mode;
121 int onstack = bdi_work_on_stack(work);
119 122
120 /* 123 /*
121 * Don't do this for I_DIRTY_PAGES - that doesn't actually 124 * For allocated work, we can clear the done/seen bit right here.
122 * dirty the inode itself 125 * For on-stack work, we need to postpone both the clear and free
126 * to after the RCU grace period, since the stack could be invalidated
127 * as soon as bdi_work_clear() has done the wakeup.
123 */ 128 */
124 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 129 if (!onstack)
125 if (sb->s_op->dirty_inode) 130 bdi_work_clear(work);
126 sb->s_op->dirty_inode(inode); 131 if (sync_mode == WB_SYNC_NONE || onstack)
127 } 132 call_rcu(&work->rcu_head, bdi_work_free);
133}
128 134
135static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
136{
129 /* 137 /*
130 * make sure that changes are seen by all cpus before we test i_state 138 * The caller has retrieved the work arguments from this work,
131 * -- mikulas 139 * drop our reference. If this is the last ref, delete and free it
132 */ 140 */
133 smp_mb(); 141 if (atomic_dec_and_test(&work->pending)) {
142 struct backing_dev_info *bdi = wb->bdi;
134 143
135 /* avoid the locking if we can */ 144 spin_lock(&bdi->wb_lock);
136 if ((inode->i_state & flags) == flags) 145 list_del_rcu(&work->list);
137 return; 146 spin_unlock(&bdi->wb_lock);
138 147
139 if (unlikely(block_dump)) 148 wb_work_complete(work);
140 block_dump___mark_inode_dirty(inode); 149 }
150}
141 151
142 spin_lock(&inode_lock); 152static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
143 if ((inode->i_state & flags) != flags) { 153{
144 const int was_dirty = inode->i_state & I_DIRTY; 154 work->seen = bdi->wb_mask;
155 BUG_ON(!work->seen);
156 atomic_set(&work->pending, bdi->wb_cnt);
157 BUG_ON(!bdi->wb_cnt);
145 158
146 inode->i_state |= flags; 159 /*
160 * list_add_tail_rcu() contains the necessary barriers to
161 * make sure the above stores are seen before the item is
162 * noticed on the list
163 */
164 spin_lock(&bdi->wb_lock);
165 list_add_tail_rcu(&work->list, &bdi->work_list);
166 spin_unlock(&bdi->wb_lock);
147 167
148 /* 168 /*
149 * If the inode is being synced, just update its dirty state. 169 * If the default thread isn't there, make sure we add it. When
150 * The unlocker will place the inode on the appropriate 170 * it gets created and wakes up, we'll run this work.
151 * superblock list, based upon its state. 171 */
152 */ 172 if (unlikely(list_empty_careful(&bdi->wb_list)))
153 if (inode->i_state & I_SYNC) 173 wake_up_process(default_backing_dev_info.wb.task);
154 goto out; 174 else {
175 struct bdi_writeback *wb = &bdi->wb;
155 176
156 /* 177 if (wb->task)
157 * Only add valid (hashed) inodes to the superblock's 178 wake_up_process(wb->task);
158 * dirty list. Add blockdev inodes as well. 179 }
159 */ 180}
160 if (!S_ISBLK(inode->i_mode)) {
161 if (hlist_unhashed(&inode->i_hash))
162 goto out;
163 }
164 if (inode->i_state & (I_FREEING|I_CLEAR))
165 goto out;
166 181
167 /* 182/*
168 * If the inode was already on s_dirty/s_io/s_more_io, don't 183 * Used for on-stack allocated work items. The caller needs to wait until
169 * reposition it (that would break s_dirty time-ordering). 184 * the wb threads have acked the work before it's safe to continue.
170 */ 185 */
171 if (!was_dirty) { 186static void bdi_wait_on_work_clear(struct bdi_work *work)
172 inode->dirtied_when = jiffies; 187{
173 list_move(&inode->i_list, &sb->s_dirty); 188 wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
174 } 189 TASK_UNINTERRUPTIBLE);
190}
191
192static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
193 struct wb_writeback_args *args)
194{
195 struct bdi_work *work;
196
197 /*
198 * This is WB_SYNC_NONE writeback, so if allocation fails just
199 * wakeup the thread for old dirty data writeback
200 */
201 work = kmalloc(sizeof(*work), GFP_ATOMIC);
202 if (work) {
203 bdi_work_init(work, args);
204 bdi_queue_work(bdi, work);
205 } else {
206 struct bdi_writeback *wb = &bdi->wb;
207
208 if (wb->task)
209 wake_up_process(wb->task);
175 } 210 }
176out:
177 spin_unlock(&inode_lock);
178} 211}
179 212
180EXPORT_SYMBOL(__mark_inode_dirty); 213/**
214 * bdi_sync_writeback - start and wait for writeback
215 * @bdi: the backing device to write from
216 * @sb: write inodes from this super_block
217 *
218 * Description:
219 * This does WB_SYNC_ALL data integrity writeback and waits for the
220 * IO to complete. Callers must hold the sb s_umount semaphore for
221 * reading, to avoid having the super disappear before we are done.
222 */
223static void bdi_sync_writeback(struct backing_dev_info *bdi,
224 struct super_block *sb)
225{
226 struct wb_writeback_args args = {
227 .sb = sb,
228 .sync_mode = WB_SYNC_ALL,
229 .nr_pages = LONG_MAX,
230 .range_cyclic = 0,
231 };
232 struct bdi_work work;
181 233
182static int write_inode(struct inode *inode, int sync) 234 bdi_work_init(&work, &args);
235 work.state |= WS_ONSTACK;
236
237 bdi_queue_work(bdi, &work);
238 bdi_wait_on_work_clear(&work);
239}
240
241/**
242 * bdi_start_writeback - start writeback
243 * @bdi: the backing device to write from
244 * @nr_pages: the number of pages to write
245 *
246 * Description:
247 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
248 * started when this function returns, we make no guarantees on
249 * completion. Caller need not hold sb s_umount semaphore.
250 *
251 */
252void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
183{ 253{
184 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) 254 struct wb_writeback_args args = {
185 return inode->i_sb->s_op->write_inode(inode, sync); 255 .sync_mode = WB_SYNC_NONE,
186 return 0; 256 .nr_pages = nr_pages,
257 .range_cyclic = 1,
258 };
259
260 bdi_alloc_queue_work(bdi, &args);
187} 261}
188 262
189/* 263/*
@@ -191,31 +265,32 @@ static int write_inode(struct inode *inode, int sync)
191 * furthest end of its superblock's dirty-inode list. 265 * furthest end of its superblock's dirty-inode list.
192 * 266 *
193 * Before stamping the inode's ->dirtied_when, we check to see whether it is 267 * Before stamping the inode's ->dirtied_when, we check to see whether it is
194 * already the most-recently-dirtied inode on the s_dirty list. If that is 268 * already the most-recently-dirtied inode on the b_dirty list. If that is
195 * the case then the inode must have been redirtied while it was being written 269 * the case then the inode must have been redirtied while it was being written
196 * out and we don't reset its dirtied_when. 270 * out and we don't reset its dirtied_when.
197 */ 271 */
198static void redirty_tail(struct inode *inode) 272static void redirty_tail(struct inode *inode)
199{ 273{
200 struct super_block *sb = inode->i_sb; 274 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
201 275
202 if (!list_empty(&sb->s_dirty)) { 276 if (!list_empty(&wb->b_dirty)) {
203 struct inode *tail_inode; 277 struct inode *tail;
204 278
205 tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); 279 tail = list_entry(wb->b_dirty.next, struct inode, i_list);
206 if (time_before(inode->dirtied_when, 280 if (time_before(inode->dirtied_when, tail->dirtied_when))
207 tail_inode->dirtied_when))
208 inode->dirtied_when = jiffies; 281 inode->dirtied_when = jiffies;
209 } 282 }
210 list_move(&inode->i_list, &sb->s_dirty); 283 list_move(&inode->i_list, &wb->b_dirty);
211} 284}
212 285
213/* 286/*
214 * requeue inode for re-scanning after sb->s_io list is exhausted. 287 * requeue inode for re-scanning after bdi->b_io list is exhausted.
215 */ 288 */
216static void requeue_io(struct inode *inode) 289static void requeue_io(struct inode *inode)
217{ 290{
218 list_move(&inode->i_list, &inode->i_sb->s_more_io); 291 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
292
293 list_move(&inode->i_list, &wb->b_more_io);
219} 294}
220 295
221static void inode_sync_complete(struct inode *inode) 296static void inode_sync_complete(struct inode *inode)
@@ -262,20 +337,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
262/* 337/*
263 * Queue all expired dirty inodes for io, eldest first. 338 * Queue all expired dirty inodes for io, eldest first.
264 */ 339 */
265static void queue_io(struct super_block *sb, 340static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
266 unsigned long *older_than_this)
267{ 341{
268 list_splice_init(&sb->s_more_io, sb->s_io.prev); 342 list_splice_init(&wb->b_more_io, wb->b_io.prev);
269 move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); 343 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
270} 344}
271 345
272int sb_has_dirty_inodes(struct super_block *sb) 346static int write_inode(struct inode *inode, int sync)
273{ 347{
274 return !list_empty(&sb->s_dirty) || 348 if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
275 !list_empty(&sb->s_io) || 349 return inode->i_sb->s_op->write_inode(inode, sync);
276 !list_empty(&sb->s_more_io); 350 return 0;
277} 351}
278EXPORT_SYMBOL(sb_has_dirty_inodes);
279 352
280/* 353/*
281 * Wait for writeback on an inode to complete. 354 * Wait for writeback on an inode to complete.
@@ -322,11 +395,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
322 if (inode->i_state & I_SYNC) { 395 if (inode->i_state & I_SYNC) {
323 /* 396 /*
324 * If this inode is locked for writeback and we are not doing 397 * If this inode is locked for writeback and we are not doing
325 * writeback-for-data-integrity, move it to s_more_io so that 398 * writeback-for-data-integrity, move it to b_more_io so that
326 * writeback can proceed with the other inodes on s_io. 399 * writeback can proceed with the other inodes on s_io.
327 * 400 *
328 * We'll have another go at writing back this inode when we 401 * We'll have another go at writing back this inode when we
329 * completed a full scan of s_io. 402 * completed a full scan of b_io.
330 */ 403 */
331 if (!wait) { 404 if (!wait) {
332 requeue_io(inode); 405 requeue_io(inode);
@@ -371,11 +444,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
371 /* 444 /*
372 * We didn't write back all the pages. nfs_writepages() 445 * We didn't write back all the pages. nfs_writepages()
373 * sometimes bales out without doing anything. Redirty 446 * sometimes bales out without doing anything. Redirty
374 * the inode; Move it from s_io onto s_more_io/s_dirty. 447 * the inode; Move it from b_io onto b_more_io/b_dirty.
375 */ 448 */
376 /* 449 /*
377 * akpm: if the caller was the kupdate function we put 450 * akpm: if the caller was the kupdate function we put
378 * this inode at the head of s_dirty so it gets first 451 * this inode at the head of b_dirty so it gets first
379 * consideration. Otherwise, move it to the tail, for 452 * consideration. Otherwise, move it to the tail, for
380 * the reasons described there. I'm not really sure 453 * the reasons described there. I'm not really sure
381 * how much sense this makes. Presumably I had a good 454 * how much sense this makes. Presumably I had a good
@@ -385,7 +458,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
385 if (wbc->for_kupdate) { 458 if (wbc->for_kupdate) {
386 /* 459 /*
387 * For the kupdate function we move the inode 460 * For the kupdate function we move the inode
388 * to s_more_io so it will get more writeout as 461 * to b_more_io so it will get more writeout as
389 * soon as the queue becomes uncongested. 462 * soon as the queue becomes uncongested.
390 */ 463 */
391 inode->i_state |= I_DIRTY_PAGES; 464 inode->i_state |= I_DIRTY_PAGES;
@@ -434,50 +507,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
434} 507}
435 508
436/* 509/*
437 * Write out a superblock's list of dirty inodes. A wait will be performed 510 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
438 * upon no inodes, all inodes or the final one, depending upon sync_mode. 511 * before calling writeback. So make sure that we do pin it, so it doesn't
439 * 512 * go away while we are writing inodes from it.
440 * If older_than_this is non-NULL, then only write out inodes which
441 * had their first dirtying at a time earlier than *older_than_this.
442 *
443 * If we're a pdflush thread, then implement pdflush collision avoidance
444 * against the entire list.
445 * 513 *
446 * If `bdi' is non-zero then we're being asked to writeback a specific queue. 514 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
447 * This function assumes that the blockdev superblock's inodes are backed by 515 * 1 if we failed.
448 * a variety of queues, so all inodes are searched. For other superblocks,
449 * assume that all inodes are backed by the same queue.
450 *
451 * FIXME: this linear search could get expensive with many fileystems. But
452 * how to fix? We need to go from an address_space to all inodes which share
453 * a queue with that address_space. (Easy: have a global "dirty superblocks"
454 * list).
455 *
456 * The inodes to be written are parked on sb->s_io. They are moved back onto
457 * sb->s_dirty as they are selected for writing. This way, none can be missed
458 * on the writer throttling path, and we get decent balancing between many
459 * throttled threads: we don't want them all piling up on inode_sync_wait.
460 */ 516 */
461void generic_sync_sb_inodes(struct super_block *sb, 517static int pin_sb_for_writeback(struct writeback_control *wbc,
518 struct inode *inode)
519{
520 struct super_block *sb = inode->i_sb;
521
522 /*
523 * Caller must already hold the ref for this
524 */
525 if (wbc->sync_mode == WB_SYNC_ALL) {
526 WARN_ON(!rwsem_is_locked(&sb->s_umount));
527 return 0;
528 }
529
530 spin_lock(&sb_lock);
531 sb->s_count++;
532 if (down_read_trylock(&sb->s_umount)) {
533 if (sb->s_root) {
534 spin_unlock(&sb_lock);
535 return 0;
536 }
537 /*
538 * umounted, drop rwsem again and fall through to failure
539 */
540 up_read(&sb->s_umount);
541 }
542
543 sb->s_count--;
544 spin_unlock(&sb_lock);
545 return 1;
546}
547
548static void unpin_sb_for_writeback(struct writeback_control *wbc,
549 struct inode *inode)
550{
551 struct super_block *sb = inode->i_sb;
552
553 if (wbc->sync_mode == WB_SYNC_ALL)
554 return;
555
556 up_read(&sb->s_umount);
557 put_super(sb);
558}
559
560static void writeback_inodes_wb(struct bdi_writeback *wb,
462 struct writeback_control *wbc) 561 struct writeback_control *wbc)
463{ 562{
563 struct super_block *sb = wbc->sb;
564 const int is_blkdev_sb = sb_is_blkdev_sb(sb);
464 const unsigned long start = jiffies; /* livelock avoidance */ 565 const unsigned long start = jiffies; /* livelock avoidance */
465 int sync = wbc->sync_mode == WB_SYNC_ALL;
466 566
467 spin_lock(&inode_lock); 567 spin_lock(&inode_lock);
468 if (!wbc->for_kupdate || list_empty(&sb->s_io))
469 queue_io(sb, wbc->older_than_this);
470 568
471 while (!list_empty(&sb->s_io)) { 569 if (!wbc->for_kupdate || list_empty(&wb->b_io))
472 struct inode *inode = list_entry(sb->s_io.prev, 570 queue_io(wb, wbc->older_than_this);
571
572 while (!list_empty(&wb->b_io)) {
573 struct inode *inode = list_entry(wb->b_io.prev,
473 struct inode, i_list); 574 struct inode, i_list);
474 struct address_space *mapping = inode->i_mapping;
475 struct backing_dev_info *bdi = mapping->backing_dev_info;
476 long pages_skipped; 575 long pages_skipped;
477 576
478 if (!bdi_cap_writeback_dirty(bdi)) { 577 /*
578 * super block given and doesn't match, skip this inode
579 */
580 if (sb && sb != inode->i_sb) {
581 redirty_tail(inode);
582 continue;
583 }
584
585 if (!bdi_cap_writeback_dirty(wb->bdi)) {
479 redirty_tail(inode); 586 redirty_tail(inode);
480 if (sb_is_blkdev_sb(sb)) { 587 if (is_blkdev_sb) {
481 /* 588 /*
482 * Dirty memory-backed blockdev: the ramdisk 589 * Dirty memory-backed blockdev: the ramdisk
483 * driver does this. Skip just this inode 590 * driver does this. Skip just this inode
@@ -497,21 +604,14 @@ void generic_sync_sb_inodes(struct super_block *sb,
497 continue; 604 continue;
498 } 605 }
499 606
500 if (wbc->nonblocking && bdi_write_congested(bdi)) { 607 if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
501 wbc->encountered_congestion = 1; 608 wbc->encountered_congestion = 1;
502 if (!sb_is_blkdev_sb(sb)) 609 if (!is_blkdev_sb)
503 break; /* Skip a congested fs */ 610 break; /* Skip a congested fs */
504 requeue_io(inode); 611 requeue_io(inode);
505 continue; /* Skip a congested blockdev */ 612 continue; /* Skip a congested blockdev */
506 } 613 }
507 614
508 if (wbc->bdi && bdi != wbc->bdi) {
509 if (!sb_is_blkdev_sb(sb))
510 break; /* fs has the wrong queue */
511 requeue_io(inode);
512 continue; /* blockdev has wrong queue */
513 }
514
515 /* 615 /*
516 * Was this inode dirtied after sync_sb_inodes was called? 616 * Was this inode dirtied after sync_sb_inodes was called?
517 * This keeps sync from extra jobs and livelock. 617 * This keeps sync from extra jobs and livelock.
@@ -519,16 +619,16 @@ void generic_sync_sb_inodes(struct super_block *sb,
519 if (inode_dirtied_after(inode, start)) 619 if (inode_dirtied_after(inode, start))
520 break; 620 break;
521 621
522 /* Is another pdflush already flushing this queue? */ 622 if (pin_sb_for_writeback(wbc, inode)) {
523 if (current_is_pdflush() && !writeback_acquire(bdi)) 623 requeue_io(inode);
524 break; 624 continue;
625 }
525 626
526 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); 627 BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
527 __iget(inode); 628 __iget(inode);
528 pages_skipped = wbc->pages_skipped; 629 pages_skipped = wbc->pages_skipped;
529 writeback_single_inode(inode, wbc); 630 writeback_single_inode(inode, wbc);
530 if (current_is_pdflush()) 631 unpin_sb_for_writeback(wbc, inode);
531 writeback_release(bdi);
532 if (wbc->pages_skipped != pages_skipped) { 632 if (wbc->pages_skipped != pages_skipped) {
533 /* 633 /*
534 * writeback is not making progress due to locked 634 * writeback is not making progress due to locked
@@ -544,144 +644,520 @@ void generic_sync_sb_inodes(struct super_block *sb,
544 wbc->more_io = 1; 644 wbc->more_io = 1;
545 break; 645 break;
546 } 646 }
547 if (!list_empty(&sb->s_more_io)) 647 if (!list_empty(&wb->b_more_io))
548 wbc->more_io = 1; 648 wbc->more_io = 1;
549 } 649 }
550 650
551 if (sync) { 651 spin_unlock(&inode_lock);
552 struct inode *inode, *old_inode = NULL; 652 /* Leave any unwritten inodes on b_io */
653}
654
655void writeback_inodes_wbc(struct writeback_control *wbc)
656{
657 struct backing_dev_info *bdi = wbc->bdi;
553 658
659 writeback_inodes_wb(&bdi->wb, wbc);
660}
661
662/*
663 * The maximum number of pages to writeout in a single bdi flush/kupdate
664 * operation. We do this so we don't hold I_SYNC against an inode for
665 * enormous amounts of time, which would block a userspace task which has
666 * been forced to throttle against that inode. Also, the code reevaluates
667 * the dirty state each time it has written this many pages.
668 */
669#define MAX_WRITEBACK_PAGES 1024
670
671static inline bool over_bground_thresh(void)
672{
673 unsigned long background_thresh, dirty_thresh;
674
675 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
676
677 return (global_page_state(NR_FILE_DIRTY) +
678 global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
679}
680
681/*
682 * Explicit flushing or periodic writeback of "old" data.
683 *
684 * Define "old": the first time one of an inode's pages is dirtied, we mark the
685 * dirtying-time in the inode's address_space. So this periodic writeback code
686 * just walks the superblock inode list, writing back any inodes which are
687 * older than a specific point in time.
688 *
689 * Try to run once per dirty_writeback_interval. But if a writeback event
690 * takes longer than a dirty_writeback_interval interval, then leave a
691 * one-second gap.
692 *
693 * older_than_this takes precedence over nr_to_write. So we'll only write back
694 * all dirty pages if they are all attached to "old" mappings.
695 */
696static long wb_writeback(struct bdi_writeback *wb,
697 struct wb_writeback_args *args)
698{
699 struct writeback_control wbc = {
700 .bdi = wb->bdi,
701 .sb = args->sb,
702 .sync_mode = args->sync_mode,
703 .older_than_this = NULL,
704 .for_kupdate = args->for_kupdate,
705 .range_cyclic = args->range_cyclic,
706 };
707 unsigned long oldest_jif;
708 long wrote = 0;
709
710 if (wbc.for_kupdate) {
711 wbc.older_than_this = &oldest_jif;
712 oldest_jif = jiffies -
713 msecs_to_jiffies(dirty_expire_interval * 10);
714 }
715 if (!wbc.range_cyclic) {
716 wbc.range_start = 0;
717 wbc.range_end = LLONG_MAX;
718 }
719
720 for (;;) {
554 /* 721 /*
555 * Data integrity sync. Must wait for all pages under writeback, 722 * Don't flush anything for non-integrity writeback where
556 * because there may have been pages dirtied before our sync 723 * no nr_pages was given
557 * call, but which had writeout started before we write it out.
558 * In which case, the inode may not be on the dirty list, but
559 * we still have to wait for that writeout.
560 */ 724 */
561 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 725 if (!args->for_kupdate && args->nr_pages <= 0 &&
562 struct address_space *mapping; 726 args->sync_mode == WB_SYNC_NONE)
727 break;
563 728
564 if (inode->i_state & 729 /*
565 (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) 730 * If no specific pages were given and this is just a
566 continue; 731 * periodic background writeout and we are below the
567 mapping = inode->i_mapping; 732 * background dirty threshold, don't do anything
568 if (mapping->nrpages == 0) 733 */
734 if (args->for_kupdate && args->nr_pages <= 0 &&
735 !over_bground_thresh())
736 break;
737
738 wbc.more_io = 0;
739 wbc.encountered_congestion = 0;
740 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
741 wbc.pages_skipped = 0;
742 writeback_inodes_wb(wb, &wbc);
743 args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
744 wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
745
746 /*
747 * If we ran out of stuff to write, bail unless more_io got set
748 */
749 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
750 if (wbc.more_io && !wbc.for_kupdate)
569 continue; 751 continue;
570 __iget(inode); 752 break;
571 spin_unlock(&inode_lock); 753 }
572 /* 754 }
573 * We hold a reference to 'inode' so it couldn't have 755
574 * been removed from s_inodes list while we dropped the 756 return wrote;
575 * inode_lock. We cannot iput the inode now as we can 757}
576 * be holding the last reference and we cannot iput it 758
577 * under inode_lock. So we keep the reference and iput 759/*
578 * it later. 760 * Return the next bdi_work struct that hasn't been processed by this
579 */ 761 * wb thread yet. ->seen is initially set for each thread that exists
580 iput(old_inode); 762 * for this device, when a thread first notices a piece of work it
581 old_inode = inode; 763 * clears its bit. Depending on writeback type, the thread will notify
764 * completion on either receiving the work (WB_SYNC_NONE) or after
765 * it is done (WB_SYNC_ALL).
766 */
767static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
768 struct bdi_writeback *wb)
769{
770 struct bdi_work *work, *ret = NULL;
771
772 rcu_read_lock();
773
774 list_for_each_entry_rcu(work, &bdi->work_list, list) {
775 if (!test_bit(wb->nr, &work->seen))
776 continue;
777 clear_bit(wb->nr, &work->seen);
778
779 ret = work;
780 break;
781 }
782
783 rcu_read_unlock();
784 return ret;
785}
786
787static long wb_check_old_data_flush(struct bdi_writeback *wb)
788{
789 unsigned long expired;
790 long nr_pages;
791
792 expired = wb->last_old_flush +
793 msecs_to_jiffies(dirty_writeback_interval * 10);
794 if (time_before(jiffies, expired))
795 return 0;
796
797 wb->last_old_flush = jiffies;
798 nr_pages = global_page_state(NR_FILE_DIRTY) +
799 global_page_state(NR_UNSTABLE_NFS) +
800 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
801
802 if (nr_pages) {
803 struct wb_writeback_args args = {
804 .nr_pages = nr_pages,
805 .sync_mode = WB_SYNC_NONE,
806 .for_kupdate = 1,
807 .range_cyclic = 1,
808 };
809
810 return wb_writeback(wb, &args);
811 }
812
813 return 0;
814}
815
816/*
817 * Retrieve work items and do the writeback they describe
818 */
819long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
820{
821 struct backing_dev_info *bdi = wb->bdi;
822 struct bdi_work *work;
823 long wrote = 0;
582 824
583 filemap_fdatawait(mapping); 825 while ((work = get_next_work_item(bdi, wb)) != NULL) {
826 struct wb_writeback_args args = work->args;
584 827
585 cond_resched(); 828 /*
829 * Override sync mode, in case we must wait for completion
830 */
831 if (force_wait)
832 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
586 833
587 spin_lock(&inode_lock); 834 /*
835 * If this isn't a data integrity operation, just notify
836 * that we have seen this work and we are now starting it.
837 */
838 if (args.sync_mode == WB_SYNC_NONE)
839 wb_clear_pending(wb, work);
840
841 wrote += wb_writeback(wb, &args);
842
843 /*
844 * This is a data integrity writeback, so only do the
845 * notification when we have completed the work.
846 */
847 if (args.sync_mode == WB_SYNC_ALL)
848 wb_clear_pending(wb, work);
849 }
850
851 /*
852 * Check for periodic writeback, kupdated() style
853 */
854 wrote += wb_check_old_data_flush(wb);
855
856 return wrote;
857}
858
859/*
860 * Handle writeback of dirty data for the device backed by this bdi. Also
861 * wakes up periodically and does kupdated style flushing.
862 */
863int bdi_writeback_task(struct bdi_writeback *wb)
864{
865 unsigned long last_active = jiffies;
866 unsigned long wait_jiffies = -1UL;
867 long pages_written;
868
869 while (!kthread_should_stop()) {
870 pages_written = wb_do_writeback(wb, 0);
871
872 if (pages_written)
873 last_active = jiffies;
874 else if (wait_jiffies != -1UL) {
875 unsigned long max_idle;
876
877 /*
878 * Longest period of inactivity that we tolerate. If we
879 * see dirty data again later, the task will get
880 * recreated automatically.
881 */
882 max_idle = max(5UL * 60 * HZ, wait_jiffies);
883 if (time_after(jiffies, max_idle + last_active))
884 break;
588 } 885 }
589 spin_unlock(&inode_lock);
590 iput(old_inode);
591 } else
592 spin_unlock(&inode_lock);
593 886
594 return; /* Leave any unwritten inodes on s_io */ 887 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
888 schedule_timeout_interruptible(wait_jiffies);
889 try_to_freeze();
890 }
891
892 return 0;
595} 893}
596EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
597 894
598static void sync_sb_inodes(struct super_block *sb, 895/*
599 struct writeback_control *wbc) 896 * Schedule writeback for all backing devices. This does WB_SYNC_NONE
897 * writeback, for integrity writeback see bdi_sync_writeback().
898 */
899static void bdi_writeback_all(struct super_block *sb, long nr_pages)
600{ 900{
601 generic_sync_sb_inodes(sb, wbc); 901 struct wb_writeback_args args = {
902 .sb = sb,
903 .nr_pages = nr_pages,
904 .sync_mode = WB_SYNC_NONE,
905 };
906 struct backing_dev_info *bdi;
907
908 rcu_read_lock();
909
910 list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
911 if (!bdi_has_dirty_io(bdi))
912 continue;
913
914 bdi_alloc_queue_work(bdi, &args);
915 }
916
917 rcu_read_unlock();
602} 918}
603 919
604/* 920/*
605 * Start writeback of dirty pagecache data against all unlocked inodes. 921 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
922 * the whole world.
923 */
924void wakeup_flusher_threads(long nr_pages)
925{
926 if (nr_pages == 0)
927 nr_pages = global_page_state(NR_FILE_DIRTY) +
928 global_page_state(NR_UNSTABLE_NFS);
929 bdi_writeback_all(NULL, nr_pages);
930}
931
932static noinline void block_dump___mark_inode_dirty(struct inode *inode)
933{
934 if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
935 struct dentry *dentry;
936 const char *name = "?";
937
938 dentry = d_find_alias(inode);
939 if (dentry) {
940 spin_lock(&dentry->d_lock);
941 name = (const char *) dentry->d_name.name;
942 }
943 printk(KERN_DEBUG
944 "%s(%d): dirtied inode %lu (%s) on %s\n",
945 current->comm, task_pid_nr(current), inode->i_ino,
946 name, inode->i_sb->s_id);
947 if (dentry) {
948 spin_unlock(&dentry->d_lock);
949 dput(dentry);
950 }
951 }
952}
953
954/**
955 * __mark_inode_dirty - internal function
956 * @inode: inode to mark
957 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
958 * Mark an inode as dirty. Callers should use mark_inode_dirty or
959 * mark_inode_dirty_sync.
960 *
961 * Put the inode on the super block's dirty list.
606 * 962 *
607 * Note: 963 * CAREFUL! We mark it dirty unconditionally, but move it onto the
608 * We don't need to grab a reference to superblock here. If it has non-empty 964 * dirty list only if it is hashed or if it refers to a blockdev.
609 * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed 965 * If it was not hashed, it will never be added to the dirty list
610 * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all 966 * even if it is later hashed, as it will have been marked dirty already.
611 * empty. Since __sync_single_inode() regains inode_lock before it finally moves
612 * inode from superblock lists we are OK.
613 * 967 *
614 * If `older_than_this' is non-zero then only flush inodes which have a 968 * In short, make sure you hash any inodes _before_ you start marking
615 * flushtime older than *older_than_this. 969 * them dirty.
616 * 970 *
617 * If `bdi' is non-zero then we will scan the first inode against each 971 * This function *must* be atomic for the I_DIRTY_PAGES case -
618 * superblock until we find the matching ones. One group will be the dirty 972 * set_page_dirty() is called under spinlock in several places.
619 * inodes against a filesystem. Then when we hit the dummy blockdev superblock, 973 *
620 * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not 974 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
621 * super-efficient but we're about to do a ton of I/O... 975 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
976 * the kernel-internal blockdev inode represents the dirtying time of the
977 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
978 * page->mapping->host, so the page-dirtying time is recorded in the internal
979 * blockdev inode.
622 */ 980 */
623void 981void __mark_inode_dirty(struct inode *inode, int flags)
624writeback_inodes(struct writeback_control *wbc)
625{ 982{
626 struct super_block *sb; 983 struct super_block *sb = inode->i_sb;
627 984
628 might_sleep(); 985 /*
629 spin_lock(&sb_lock); 986 * Don't do this for I_DIRTY_PAGES - that doesn't actually
630restart: 987 * dirty the inode itself
631 list_for_each_entry_reverse(sb, &super_blocks, s_list) { 988 */
632 if (sb_has_dirty_inodes(sb)) { 989 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
633 /* we're making our own get_super here */ 990 if (sb->s_op->dirty_inode)
634 sb->s_count++; 991 sb->s_op->dirty_inode(inode);
635 spin_unlock(&sb_lock); 992 }
636 /* 993
637 * If we can't get the readlock, there's no sense in 994 /*
638 * waiting around, most of the time the FS is going to 995 * make sure that changes are seen by all cpus before we test i_state
639 * be unmounted by the time it is released. 996 * -- mikulas
640 */ 997 */
641 if (down_read_trylock(&sb->s_umount)) { 998 smp_mb();
642 if (sb->s_root) 999
643 sync_sb_inodes(sb, wbc); 1000 /* avoid the locking if we can */
644 up_read(&sb->s_umount); 1001 if ((inode->i_state & flags) == flags)
1002 return;
1003
1004 if (unlikely(block_dump))
1005 block_dump___mark_inode_dirty(inode);
1006
1007 spin_lock(&inode_lock);
1008 if ((inode->i_state & flags) != flags) {
1009 const int was_dirty = inode->i_state & I_DIRTY;
1010
1011 inode->i_state |= flags;
1012
1013 /*
1014 * If the inode is being synced, just update its dirty state.
1015 * The unlocker will place the inode on the appropriate
1016 * superblock list, based upon its state.
1017 */
1018 if (inode->i_state & I_SYNC)
1019 goto out;
1020
1021 /*
1022 * Only add valid (hashed) inodes to the superblock's
1023 * dirty list. Add blockdev inodes as well.
1024 */
1025 if (!S_ISBLK(inode->i_mode)) {
1026 if (hlist_unhashed(&inode->i_hash))
1027 goto out;
1028 }
1029 if (inode->i_state & (I_FREEING|I_CLEAR))
1030 goto out;
1031
1032 /*
1033 * If the inode was already on b_dirty/b_io/b_more_io, don't
1034 * reposition it (that would break b_dirty time-ordering).
1035 */
1036 if (!was_dirty) {
1037 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
1038 struct backing_dev_info *bdi = wb->bdi;
1039
1040 if (bdi_cap_writeback_dirty(bdi) &&
1041 !test_bit(BDI_registered, &bdi->state)) {
1042 WARN_ON(1);
1043 printk(KERN_ERR "bdi-%s not registered\n",
1044 bdi->name);
645 } 1045 }
646 spin_lock(&sb_lock); 1046
647 if (__put_super_and_need_restart(sb)) 1047 inode->dirtied_when = jiffies;
648 goto restart; 1048 list_move(&inode->i_list, &wb->b_dirty);
649 } 1049 }
650 if (wbc->nr_to_write <= 0)
651 break;
652 } 1050 }
653 spin_unlock(&sb_lock); 1051out:
1052 spin_unlock(&inode_lock);
654} 1053}
1054EXPORT_SYMBOL(__mark_inode_dirty);
655 1055
656/* 1056/*
657 * writeback and wait upon the filesystem's dirty inodes. The caller will 1057 * Write out a superblock's list of dirty inodes. A wait will be performed
658 * do this in two passes - one to write, and one to wait. 1058 * upon no inodes, all inodes or the final one, depending upon sync_mode.
1059 *
1060 * If older_than_this is non-NULL, then only write out inodes which
1061 * had their first dirtying at a time earlier than *older_than_this.
1062 *
1063 * If we're a pdflush thread, then implement pdflush collision avoidance
1064 * against the entire list.
659 * 1065 *
660 * A finite limit is set on the number of pages which will be written. 1066 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
661 * To prevent infinite livelock of sys_sync(). 1067 * This function assumes that the blockdev superblock's inodes are backed by
1068 * a variety of queues, so all inodes are searched. For other superblocks,
1069 * assume that all inodes are backed by the same queue.
662 * 1070 *
663 * We add in the number of potentially dirty inodes, because each inode write 1071 * The inodes to be written are parked on bdi->b_io. They are moved back onto
664 * can dirty pagecache in the underlying blockdev. 1072 * bdi->b_dirty as they are selected for writing. This way, none can be missed
1073 * on the writer throttling path, and we get decent balancing between many
1074 * throttled threads: we don't want them all piling up on inode_sync_wait.
665 */ 1075 */
666void sync_inodes_sb(struct super_block *sb, int wait) 1076static void wait_sb_inodes(struct super_block *sb)
667{ 1077{
668 struct writeback_control wbc = { 1078 struct inode *inode, *old_inode = NULL;
669 .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, 1079
670 .range_start = 0, 1080 /*
671 .range_end = LLONG_MAX, 1081 * We need to be protected against the filesystem going from
672 }; 1082 * r/o to r/w or vice versa.
1083 */
1084 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1085
1086 spin_lock(&inode_lock);
1087
1088 /*
1089 * Data integrity sync. Must wait for all pages under writeback,
1090 * because there may have been pages dirtied before our sync
1091 * call, but which had writeout started before we write it out.
1092 * In which case, the inode may not be on the dirty list, but
1093 * we still have to wait for that writeout.
1094 */
1095 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1096 struct address_space *mapping;
1097
1098 if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
1099 continue;
1100 mapping = inode->i_mapping;
1101 if (mapping->nrpages == 0)
1102 continue;
1103 __iget(inode);
1104 spin_unlock(&inode_lock);
1105 /*
1106 * We hold a reference to 'inode' so it couldn't have
1107 * been removed from s_inodes list while we dropped the
1108 * inode_lock. We cannot iput the inode now as we can
1109 * be holding the last reference and we cannot iput it
1110 * under inode_lock. So we keep the reference and iput
1111 * it later.
1112 */
1113 iput(old_inode);
1114 old_inode = inode;
1115
1116 filemap_fdatawait(mapping);
1117
1118 cond_resched();
673 1119
674 if (!wait) { 1120 spin_lock(&inode_lock);
675 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); 1121 }
676 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); 1122 spin_unlock(&inode_lock);
1123 iput(old_inode);
1124}
677 1125
678 wbc.nr_to_write = nr_dirty + nr_unstable + 1126/**
1127 * writeback_inodes_sb - writeback dirty inodes from given super_block
1128 * @sb: the superblock
1129 *
1130 * Start writeback on some inodes on this super_block. No guarantees are made
1131 * on how many (if any) will be written, and this function does not wait
1132 * for IO completion of submitted IO. Nothing is returned to the caller,
1133 * not even the number of pages submitted.
1134 */
1135void writeback_inodes_sb(struct super_block *sb)
1136{
1137 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1138 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1139 long nr_to_write;
1140
1141 nr_to_write = nr_dirty + nr_unstable +
679 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 1142 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
680 } else
681 wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */
682 1143
683 sync_sb_inodes(sb, &wbc); 1144 bdi_writeback_all(sb, nr_to_write);
1145}
1146EXPORT_SYMBOL(writeback_inodes_sb);
1147
1148/**
1149 * sync_inodes_sb - sync sb inode pages
1150 * @sb: the superblock
1151 *
1152 * This function writes and waits on any dirty inode belonging to this
1153 * super_block. Nothing is returned to the caller.
1154 */
1155void sync_inodes_sb(struct super_block *sb)
1156{
1157 bdi_sync_writeback(sb->s_bdi, sb);
1158 wait_sb_inodes(sb);
684} 1159}
1160EXPORT_SYMBOL(sync_inodes_sb);
685 1161
686/** 1162/**
687 * write_inode_now - write an inode to disk 1163 * write_inode_now - write an inode to disk
@@ -737,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
737 return ret; 1213 return ret;
738} 1214}
739EXPORT_SYMBOL(sync_inode); 1215EXPORT_SYMBOL(sync_inode);
740
741/**
742 * generic_osync_inode - flush all dirty data for a given inode to disk
743 * @inode: inode to write
744 * @mapping: the address_space that should be flushed
745 * @what: what to write and wait upon
746 *
747 * This can be called by file_write functions for files which have the
748 * O_SYNC flag set, to flush dirty writes to disk.
749 *
750 * @what is a bitmask, specifying which part of the inode's data should be
751 * written and waited upon.
752 *
753 * OSYNC_DATA: i_mapping's dirty data
754 * OSYNC_METADATA: the buffers at i_mapping->private_list
755 * OSYNC_INODE: the inode itself
756 */
757
758int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
759{
760 int err = 0;
761 int need_write_inode_now = 0;
762 int err2;
763
764 if (what & OSYNC_DATA)
765 err = filemap_fdatawrite(mapping);
766 if (what & (OSYNC_METADATA|OSYNC_DATA)) {
767 err2 = sync_mapping_buffers(mapping);
768 if (!err)
769 err = err2;
770 }
771 if (what & OSYNC_DATA) {
772 err2 = filemap_fdatawait(mapping);
773 if (!err)
774 err = err2;
775 }
776
777 spin_lock(&inode_lock);
778 if ((inode->i_state & I_DIRTY) &&
779 ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
780 need_write_inode_now = 1;
781 spin_unlock(&inode_lock);
782
783 if (need_write_inode_now) {
784 err2 = write_inode_now(inode, 1);
785 if (!err)
786 err = err2;
787 }
788 else
789 inode_sync_wait(inode);
790
791 return err;
792}
793EXPORT_SYMBOL(generic_osync_inode);
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 99c99dfb0373..3773fd63d2f9 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
61 return simple_read_from_buffer(buf, len, ppos, tmp, size); 61 return simple_read_from_buffer(buf, len, ppos, tmp, size);
62} 62}
63 63
64static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
65 size_t len, loff_t *ppos, unsigned val)
66{
67 char tmp[32];
68 size_t size = sprintf(tmp, "%u\n", val);
69
70 return simple_read_from_buffer(buf, len, ppos, tmp, size);
71}
72
73static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
74 size_t count, loff_t *ppos, unsigned *val,
75 unsigned global_limit)
76{
77 unsigned long t;
78 char tmp[32];
79 unsigned limit = (1 << 16) - 1;
80 int err;
81
82 if (*ppos || count >= sizeof(tmp) - 1)
83 return -EINVAL;
84
85 if (copy_from_user(tmp, buf, count))
86 return -EINVAL;
87
88 tmp[count] = '\0';
89
90 err = strict_strtoul(tmp, 0, &t);
91 if (err)
92 return err;
93
94 if (!capable(CAP_SYS_ADMIN))
95 limit = min(limit, global_limit);
96
97 if (t > limit)
98 return -EINVAL;
99
100 *val = t;
101
102 return count;
103}
104
105static ssize_t fuse_conn_max_background_read(struct file *file,
106 char __user *buf, size_t len,
107 loff_t *ppos)
108{
109 struct fuse_conn *fc;
110 unsigned val;
111
112 fc = fuse_ctl_file_conn_get(file);
113 if (!fc)
114 return 0;
115
116 val = fc->max_background;
117 fuse_conn_put(fc);
118
119 return fuse_conn_limit_read(file, buf, len, ppos, val);
120}
121
122static ssize_t fuse_conn_max_background_write(struct file *file,
123 const char __user *buf,
124 size_t count, loff_t *ppos)
125{
126 unsigned val;
127 ssize_t ret;
128
129 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
130 max_user_bgreq);
131 if (ret > 0) {
132 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
133 if (fc) {
134 fc->max_background = val;
135 fuse_conn_put(fc);
136 }
137 }
138
139 return ret;
140}
141
142static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
143 char __user *buf, size_t len,
144 loff_t *ppos)
145{
146 struct fuse_conn *fc;
147 unsigned val;
148
149 fc = fuse_ctl_file_conn_get(file);
150 if (!fc)
151 return 0;
152
153 val = fc->congestion_threshold;
154 fuse_conn_put(fc);
155
156 return fuse_conn_limit_read(file, buf, len, ppos, val);
157}
158
159static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
160 const char __user *buf,
161 size_t count, loff_t *ppos)
162{
163 unsigned val;
164 ssize_t ret;
165
166 ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
167 max_user_congthresh);
168 if (ret > 0) {
169 struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
170 if (fc) {
171 fc->congestion_threshold = val;
172 fuse_conn_put(fc);
173 }
174 }
175
176 return ret;
177}
178
64static const struct file_operations fuse_ctl_abort_ops = { 179static const struct file_operations fuse_ctl_abort_ops = {
65 .open = nonseekable_open, 180 .open = nonseekable_open,
66 .write = fuse_conn_abort_write, 181 .write = fuse_conn_abort_write,
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = {
71 .read = fuse_conn_waiting_read, 186 .read = fuse_conn_waiting_read,
72}; 187};
73 188
189static const struct file_operations fuse_conn_max_background_ops = {
190 .open = nonseekable_open,
191 .read = fuse_conn_max_background_read,
192 .write = fuse_conn_max_background_write,
193};
194
195static const struct file_operations fuse_conn_congestion_threshold_ops = {
196 .open = nonseekable_open,
197 .read = fuse_conn_congestion_threshold_read,
198 .write = fuse_conn_congestion_threshold_write,
199};
200
74static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, 201static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
75 struct fuse_conn *fc, 202 struct fuse_conn *fc,
76 const char *name, 203 const char *name,
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
127 goto err; 254 goto err;
128 255
129 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, 256 if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
130 NULL, &fuse_ctl_waiting_ops) || 257 NULL, &fuse_ctl_waiting_ops) ||
131 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, 258 !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
132 NULL, &fuse_ctl_abort_ops)) 259 NULL, &fuse_ctl_abort_ops) ||
260 !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
261 1, NULL, &fuse_conn_max_background_ops) ||
262 !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
263 S_IFREG | 0600, 1, NULL,
264 &fuse_conn_congestion_threshold_ops))
133 goto err; 265 goto err;
134 266
135 return 0; 267 return 0;
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
156 d_drop(dentry); 288 d_drop(dentry);
157 dput(dentry); 289 dput(dentry);
158 } 290 }
159 fuse_control_sb->s_root->d_inode->i_nlink--; 291 drop_nlink(fuse_control_sb->s_root->d_inode);
160} 292}
161 293
162static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) 294static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 6484eb75acd6..51d9e33d634f 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
250 250
251static void flush_bg_queue(struct fuse_conn *fc) 251static void flush_bg_queue(struct fuse_conn *fc)
252{ 252{
253 while (fc->active_background < FUSE_MAX_BACKGROUND && 253 while (fc->active_background < fc->max_background &&
254 !list_empty(&fc->bg_queue)) { 254 !list_empty(&fc->bg_queue)) {
255 struct fuse_req *req; 255 struct fuse_req *req;
256 256
@@ -280,11 +280,11 @@ __releases(&fc->lock)
280 list_del(&req->intr_entry); 280 list_del(&req->intr_entry);
281 req->state = FUSE_REQ_FINISHED; 281 req->state = FUSE_REQ_FINISHED;
282 if (req->background) { 282 if (req->background) {
283 if (fc->num_background == FUSE_MAX_BACKGROUND) { 283 if (fc->num_background == fc->max_background) {
284 fc->blocked = 0; 284 fc->blocked = 0;
285 wake_up_all(&fc->blocked_waitq); 285 wake_up_all(&fc->blocked_waitq);
286 } 286 }
287 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 287 if (fc->num_background == fc->congestion_threshold &&
288 fc->connected && fc->bdi_initialized) { 288 fc->connected && fc->bdi_initialized) {
289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); 289 clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 290 clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
410{ 410{
411 req->background = 1; 411 req->background = 1;
412 fc->num_background++; 412 fc->num_background++;
413 if (fc->num_background == FUSE_MAX_BACKGROUND) 413 if (fc->num_background == fc->max_background)
414 fc->blocked = 1; 414 fc->blocked = 1;
415 if (fc->num_background == FUSE_CONGESTION_THRESHOLD && 415 if (fc->num_background == fc->congestion_threshold &&
416 fc->bdi_initialized) { 416 fc->bdi_initialized) {
417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC); 417 set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); 418 set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 52b641fc0faf..fc9c79feb5f7 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -25,12 +25,6 @@
25/** Max number of pages that can be used in a single read request */ 25/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 26#define FUSE_MAX_PAGES_PER_REQ 32
27 27
28/** Maximum number of outstanding background requests */
29#define FUSE_MAX_BACKGROUND 12
30
31/** Congestion starts at 75% of maximum */
32#define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100)
33
34/** Bias for fi->writectr, meaning new writepages must not be sent */ 28/** Bias for fi->writectr, meaning new writepages must not be sent */
35#define FUSE_NOWRITE INT_MIN 29#define FUSE_NOWRITE INT_MIN
36 30
@@ -38,7 +32,7 @@
38#define FUSE_NAME_MAX 1024 32#define FUSE_NAME_MAX 1024
39 33
40/** Number of dentries for each connection in the control filesystem */ 34/** Number of dentries for each connection in the control filesystem */
41#define FUSE_CTL_NUM_DENTRIES 3 35#define FUSE_CTL_NUM_DENTRIES 5
42 36
43/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem 37/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
44 module will check permissions based on the file mode. Otherwise no 38 module will check permissions based on the file mode. Otherwise no
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list;
55/** Global mutex protecting fuse_conn_list and the control filesystem */ 49/** Global mutex protecting fuse_conn_list and the control filesystem */
56extern struct mutex fuse_mutex; 50extern struct mutex fuse_mutex;
57 51
52/** Module parameters */
53extern unsigned max_user_bgreq;
54extern unsigned max_user_congthresh;
55
58/** FUSE inode */ 56/** FUSE inode */
59struct fuse_inode { 57struct fuse_inode {
60 /** Inode data */ 58 /** Inode data */
@@ -349,6 +347,12 @@ struct fuse_conn {
349 /** rbtree of fuse_files waiting for poll events indexed by ph */ 347 /** rbtree of fuse_files waiting for poll events indexed by ph */
350 struct rb_root polled_files; 348 struct rb_root polled_files;
351 349
350 /** Maximum number of outstanding background requests */
351 unsigned max_background;
352
353 /** Number of background requests at which congestion starts */
354 unsigned congestion_threshold;
355
352 /** Number of requests currently in the background */ 356 /** Number of requests currently in the background */
353 unsigned num_background; 357 unsigned num_background;
354 358
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f91ccc4a189d..6da947daabda 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -14,6 +14,7 @@
14#include <linux/seq_file.h> 14#include <linux/seq_file.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/moduleparam.h>
17#include <linux/parser.h> 18#include <linux/parser.h>
18#include <linux/statfs.h> 19#include <linux/statfs.h>
19#include <linux/random.h> 20#include <linux/random.h>
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep;
28struct list_head fuse_conn_list; 29struct list_head fuse_conn_list;
29DEFINE_MUTEX(fuse_mutex); 30DEFINE_MUTEX(fuse_mutex);
30 31
32static int set_global_limit(const char *val, struct kernel_param *kp);
33
34unsigned max_user_bgreq;
35module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
36 &max_user_bgreq, 0644);
37__MODULE_PARM_TYPE(max_user_bgreq, "uint");
38MODULE_PARM_DESC(max_user_bgreq,
39 "Global limit for the maximum number of backgrounded requests an "
40 "unprivileged user can set");
41
42unsigned max_user_congthresh;
43module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
44 &max_user_congthresh, 0644);
45__MODULE_PARM_TYPE(max_user_congthresh, "uint");
46MODULE_PARM_DESC(max_user_congthresh,
47 "Global limit for the maximum congestion threshold an "
48 "unprivileged user can set");
49
31#define FUSE_SUPER_MAGIC 0x65735546 50#define FUSE_SUPER_MAGIC 0x65735546
32 51
33#define FUSE_DEFAULT_BLKSIZE 512 52#define FUSE_DEFAULT_BLKSIZE 512
34 53
54/** Maximum number of outstanding background requests */
55#define FUSE_DEFAULT_MAX_BACKGROUND 12
56
57/** Congestion starts at 75% of maximum */
58#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
59
35struct fuse_mount_data { 60struct fuse_mount_data {
36 int fd; 61 int fd;
37 unsigned rootmode; 62 unsigned rootmode;
@@ -517,6 +542,8 @@ void fuse_conn_init(struct fuse_conn *fc)
517 INIT_LIST_HEAD(&fc->bg_queue); 542 INIT_LIST_HEAD(&fc->bg_queue);
518 INIT_LIST_HEAD(&fc->entry); 543 INIT_LIST_HEAD(&fc->entry);
519 atomic_set(&fc->num_waiting, 0); 544 atomic_set(&fc->num_waiting, 0);
545 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
546 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
520 fc->khctr = 0; 547 fc->khctr = 0;
521 fc->polled_files = RB_ROOT; 548 fc->polled_files = RB_ROOT;
522 fc->reqctr = 0; 549 fc->reqctr = 0;
@@ -727,6 +754,54 @@ static const struct super_operations fuse_super_operations = {
727 .show_options = fuse_show_options, 754 .show_options = fuse_show_options,
728}; 755};
729 756
757static void sanitize_global_limit(unsigned *limit)
758{
759 if (*limit == 0)
760 *limit = ((num_physpages << PAGE_SHIFT) >> 13) /
761 sizeof(struct fuse_req);
762
763 if (*limit >= 1 << 16)
764 *limit = (1 << 16) - 1;
765}
766
767static int set_global_limit(const char *val, struct kernel_param *kp)
768{
769 int rv;
770
771 rv = param_set_uint(val, kp);
772 if (rv)
773 return rv;
774
775 sanitize_global_limit((unsigned *)kp->arg);
776
777 return 0;
778}
779
780static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
781{
782 int cap_sys_admin = capable(CAP_SYS_ADMIN);
783
784 if (arg->minor < 13)
785 return;
786
787 sanitize_global_limit(&max_user_bgreq);
788 sanitize_global_limit(&max_user_congthresh);
789
790 if (arg->max_background) {
791 fc->max_background = arg->max_background;
792
793 if (!cap_sys_admin && fc->max_background > max_user_bgreq)
794 fc->max_background = max_user_bgreq;
795 }
796 if (arg->congestion_threshold) {
797 fc->congestion_threshold = arg->congestion_threshold;
798
799 if (!cap_sys_admin &&
800 fc->congestion_threshold > max_user_congthresh)
801 fc->congestion_threshold = max_user_congthresh;
802 }
803}
804
730static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 805static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
731{ 806{
732 struct fuse_init_out *arg = &req->misc.init_out; 807 struct fuse_init_out *arg = &req->misc.init_out;
@@ -736,6 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
736 else { 811 else {
737 unsigned long ra_pages; 812 unsigned long ra_pages;
738 813
814 process_init_limits(fc, arg);
815
739 if (arg->minor >= 6) { 816 if (arg->minor >= 6) {
740 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; 817 ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
741 if (arg->flags & FUSE_ASYNC_READ) 818 if (arg->flags & FUSE_ASYNC_READ)
@@ -801,6 +878,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
801{ 878{
802 int err; 879 int err;
803 880
881 fc->bdi.name = "fuse";
804 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 882 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
805 fc->bdi.unplug_io_fn = default_unplug_io_fn; 883 fc->bdi.unplug_io_fn = default_unplug_io_fn;
806 /* fuse does it's own writeback accounting */ 884 /* fuse does it's own writeback accounting */
@@ -893,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
893 if (err) 971 if (err)
894 goto err_put_conn; 972 goto err_put_conn;
895 973
974 sb->s_bdi = &fc->bdi;
975
896 /* Handle umasking inside the fuse code */ 976 /* Handle umasking inside the fuse code */
897 if (sb->s_flags & MS_POSIXACL) 977 if (sb->s_flags & MS_POSIXACL)
898 fc->dont_mask = 1; 978 fc->dont_mask = 1;
@@ -1147,6 +1227,9 @@ static int __init fuse_init(void)
1147 if (res) 1227 if (res)
1148 goto err_sysfs_cleanup; 1228 goto err_sysfs_cleanup;
1149 1229
1230 sanitize_global_limit(&max_user_bgreq);
1231 sanitize_global_limit(&max_user_congthresh);
1232
1150 return 0; 1233 return 0;
1151 1234
1152 err_sysfs_cleanup: 1235 err_sysfs_cleanup:
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 3da2f1f4f738..21f7e46da4c0 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS := -I$(src) 1EXTRA_CFLAGS := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
5 aops.o dentry.o export.o file.o \ 5 aops.o dentry.o export.o file.o \
6 ops_fstype.o ops_inode.o quota.o \ 6 ops_fstype.o ops_inode.o quota.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index fa881bdc3d85..3fc4e3ac7d84 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -19,8 +19,7 @@
19#include "gfs2.h" 19#include "gfs2.h"
20#include "incore.h" 20#include "incore.h"
21#include "acl.h" 21#include "acl.h"
22#include "eaops.h" 22#include "xattr.h"
23#include "eattr.h"
24#include "glock.h" 23#include "glock.h"
25#include "inode.h" 24#include "inode.h"
26#include "meta_io.h" 25#include "meta_io.h"
@@ -31,8 +30,7 @@
31#define ACL_DEFAULT 0 30#define ACL_DEFAULT 0
32 31
33int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, 32int gfs2_acl_validate_set(struct gfs2_inode *ip, int access,
34 struct gfs2_ea_request *er, 33 struct gfs2_ea_request *er, int *remove, mode_t *mode)
35 int *remove, mode_t *mode)
36{ 34{
37 struct posix_acl *acl; 35 struct posix_acl *acl;
38 int error; 36 int error;
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access)
83 return 0; 81 return 0;
84} 82}
85 83
86static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, 84static int acl_get(struct gfs2_inode *ip, const char *name,
87 struct gfs2_ea_location *el, char **data, unsigned int *len) 85 struct posix_acl **acl, struct gfs2_ea_location *el,
86 char **datap, unsigned int *lenp)
88{ 87{
89 struct gfs2_ea_request er; 88 char *data;
90 struct gfs2_ea_location el_this; 89 unsigned int len;
91 int error; 90 int error;
92 91
92 el->el_bh = NULL;
93
93 if (!ip->i_eattr) 94 if (!ip->i_eattr)
94 return 0; 95 return 0;
95 96
96 memset(&er, 0, sizeof(struct gfs2_ea_request)); 97 error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el);
97 if (access) {
98 er.er_name = GFS2_POSIX_ACL_ACCESS;
99 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
100 } else {
101 er.er_name = GFS2_POSIX_ACL_DEFAULT;
102 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN;
103 }
104 er.er_type = GFS2_EATYPE_SYS;
105
106 if (!el)
107 el = &el_this;
108
109 error = gfs2_ea_find(ip, &er, el);
110 if (error) 98 if (error)
111 return error; 99 return error;
112 if (!el->el_ea) 100 if (!el->el_ea)
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
114 if (!GFS2_EA_DATA_LEN(el->el_ea)) 102 if (!GFS2_EA_DATA_LEN(el->el_ea))
115 goto out; 103 goto out;
116 104
117 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); 105 len = GFS2_EA_DATA_LEN(el->el_ea);
118 er.er_data = kmalloc(er.er_data_len, GFP_NOFS); 106 data = kmalloc(len, GFP_NOFS);
119 error = -ENOMEM; 107 error = -ENOMEM;
120 if (!er.er_data) 108 if (!data)
121 goto out; 109 goto out;
122 110
123 error = gfs2_ea_get_copy(ip, el, er.er_data); 111 error = gfs2_ea_get_copy(ip, el, data, len);
124 if (error) 112 if (error < 0)
125 goto out_kfree; 113 goto out_kfree;
114 error = 0;
126 115
127 if (acl) { 116 if (acl) {
128 *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); 117 *acl = posix_acl_from_xattr(data, len);
129 if (IS_ERR(*acl)) 118 if (IS_ERR(*acl))
130 error = PTR_ERR(*acl); 119 error = PTR_ERR(*acl);
131 } 120 }
132 121
133out_kfree: 122out_kfree:
134 if (error || !data) 123 if (error || !datap) {
135 kfree(er.er_data); 124 kfree(data);
136 else { 125 } else {
137 *data = er.er_data; 126 *datap = data;
138 *len = er.er_data_len; 127 *lenp = len;
139 } 128 }
140out: 129out:
141 if (error || el == &el_this)
142 brelse(el->el_bh);
143 return error; 130 return error;
144} 131}
145 132
@@ -153,10 +140,12 @@ out:
153 140
154int gfs2_check_acl(struct inode *inode, int mask) 141int gfs2_check_acl(struct inode *inode, int mask)
155{ 142{
143 struct gfs2_ea_location el;
156 struct posix_acl *acl = NULL; 144 struct posix_acl *acl = NULL;
157 int error; 145 int error;
158 146
159 error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); 147 error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL);
148 brelse(el.el_bh);
160 if (error) 149 if (error)
161 return error; 150 return error;
162 151
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode)
196 185
197int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) 186int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
198{ 187{
188 struct gfs2_ea_location el;
199 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 189 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
200 struct posix_acl *acl = NULL, *clone; 190 struct posix_acl *acl = NULL, *clone;
201 struct gfs2_ea_request er;
202 mode_t mode = ip->i_inode.i_mode; 191 mode_t mode = ip->i_inode.i_mode;
192 char *data = NULL;
193 unsigned int len;
203 int error; 194 int error;
204 195
205 if (!sdp->sd_args.ar_posix_acl) 196 if (!sdp->sd_args.ar_posix_acl)
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
207 if (S_ISLNK(ip->i_inode.i_mode)) 198 if (S_ISLNK(ip->i_inode.i_mode))
208 return 0; 199 return 0;
209 200
210 memset(&er, 0, sizeof(struct gfs2_ea_request)); 201 error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len);
211 er.er_type = GFS2_EATYPE_SYS; 202 brelse(el.el_bh);
212
213 error = acl_get(dip, ACL_DEFAULT, &acl, NULL,
214 &er.er_data, &er.er_data_len);
215 if (error) 203 if (error)
216 return error; 204 return error;
217 if (!acl) { 205 if (!acl) {
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
229 acl = clone; 217 acl = clone;
230 218
231 if (S_ISDIR(ip->i_inode.i_mode)) { 219 if (S_ISDIR(ip->i_inode.i_mode)) {
232 er.er_name = GFS2_POSIX_ACL_DEFAULT; 220 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
233 er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; 221 GFS2_POSIX_ACL_DEFAULT, data, len, 0);
234 error = gfs2_system_eaops.eo_set(ip, &er);
235 if (error) 222 if (error)
236 goto out; 223 goto out;
237 } 224 }
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
239 error = posix_acl_create_masq(acl, &mode); 226 error = posix_acl_create_masq(acl, &mode);
240 if (error < 0) 227 if (error < 0)
241 goto out; 228 goto out;
242 if (error > 0) { 229 if (error == 0)
243 er.er_name = GFS2_POSIX_ACL_ACCESS; 230 goto munge;
244 er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN;
245 posix_acl_to_xattr(acl, er.er_data, er.er_data_len);
246 er.er_mode = mode;
247 er.er_flags = GFS2_ERF_MODE;
248 error = gfs2_system_eaops.eo_set(ip, &er);
249 if (error)
250 goto out;
251 } else
252 munge_mode(ip, mode);
253 231
232 posix_acl_to_xattr(acl, data, len);
233 error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS,
234 GFS2_POSIX_ACL_ACCESS, data, len, 0);
235 if (error)
236 goto out;
237munge:
238 error = munge_mode(ip, mode);
254out: 239out:
255 posix_acl_release(acl); 240 posix_acl_release(acl);
256 kfree(er.er_data); 241 kfree(data);
257 return error; 242 return error;
258} 243}
259 244
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
265 unsigned int len; 250 unsigned int len;
266 int error; 251 int error;
267 252
268 error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); 253 error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len);
269 if (error) 254 if (error)
270 return error; 255 goto out_brelse;
271 if (!acl) 256 if (!acl)
272 return gfs2_setattr_simple(ip, attr); 257 return gfs2_setattr_simple(ip, attr);
273 258
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
286 271
287out: 272out:
288 posix_acl_release(acl); 273 posix_acl_release(acl);
289 brelse(el.el_bh);
290 kfree(data); 274 kfree(data);
275out_brelse:
276 brelse(el.el_bh);
291 return error; 277 return error;
292} 278}
293 279
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 022c66cd5606..91beddadd388 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
107 return 0; 107 return 0;
108} 108}
109 109
110static int gfs2_dentry_delete(struct dentry *dentry)
111{
112 struct gfs2_inode *ginode;
113
114 if (!dentry->d_inode)
115 return 0;
116
117 ginode = GFS2_I(dentry->d_inode);
118 if (!ginode->i_iopen_gh.gh_gl)
119 return 0;
120
121 if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags))
122 return 1;
123
124 return 0;
125}
126
110const struct dentry_operations gfs2_dops = { 127const struct dentry_operations gfs2_dops = {
111 .d_revalidate = gfs2_drevalidate, 128 .d_revalidate = gfs2_drevalidate,
112 .d_hash = gfs2_dhash, 129 .d_hash = gfs2_dhash,
130 .d_delete = gfs2_dentry_delete,
113}; 131};
114 132
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
deleted file mode 100644
index dee9b03e5b37..000000000000
--- a/fs/gfs2/eaops.c
+++ /dev/null
@@ -1,157 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/capability.h>
15#include <linux/xattr.h>
16#include <linux/gfs2_ondisk.h>
17#include <asm/uaccess.h>
18
19#include "gfs2.h"
20#include "incore.h"
21#include "acl.h"
22#include "eaops.h"
23#include "eattr.h"
24#include "util.h"
25
26/**
27 * gfs2_ea_name2type - get the type of the ea, and truncate type from the name
28 * @namep: ea name, possibly with type appended
29 *
30 * Returns: GFS2_EATYPE_XXX
31 */
32
33unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name)
34{
35 unsigned int type;
36
37 if (strncmp(name, "system.", 7) == 0) {
38 type = GFS2_EATYPE_SYS;
39 if (truncated_name)
40 *truncated_name = name + sizeof("system.") - 1;
41 } else if (strncmp(name, "user.", 5) == 0) {
42 type = GFS2_EATYPE_USR;
43 if (truncated_name)
44 *truncated_name = name + sizeof("user.") - 1;
45 } else if (strncmp(name, "security.", 9) == 0) {
46 type = GFS2_EATYPE_SECURITY;
47 if (truncated_name)
48 *truncated_name = name + sizeof("security.") - 1;
49 } else {
50 type = GFS2_EATYPE_UNUSED;
51 if (truncated_name)
52 *truncated_name = NULL;
53 }
54
55 return type;
56}
57
58static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
59{
60 if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) &&
61 !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) &&
62 !capable(CAP_SYS_ADMIN))
63 return -EPERM;
64
65 if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 &&
66 (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) ||
67 GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)))
68 return -EOPNOTSUPP;
69
70 return gfs2_ea_get_i(ip, er);
71}
72
73static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
74{
75 int remove = 0;
76 int error;
77
78 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
79 if (!(er->er_flags & GFS2_ERF_MODE)) {
80 er->er_mode = ip->i_inode.i_mode;
81 er->er_flags |= GFS2_ERF_MODE;
82 }
83 error = gfs2_acl_validate_set(ip, 1, er,
84 &remove, &er->er_mode);
85 if (error)
86 return error;
87 error = gfs2_ea_set_i(ip, er);
88 if (error)
89 return error;
90 if (remove)
91 gfs2_ea_remove_i(ip, er);
92 return 0;
93
94 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
95 error = gfs2_acl_validate_set(ip, 0, er,
96 &remove, NULL);
97 if (error)
98 return error;
99 if (!remove)
100 error = gfs2_ea_set_i(ip, er);
101 else {
102 error = gfs2_ea_remove_i(ip, er);
103 if (error == -ENODATA)
104 error = 0;
105 }
106 return error;
107 }
108
109 return -EPERM;
110}
111
112static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er)
113{
114 if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) {
115 int error = gfs2_acl_validate_remove(ip, 1);
116 if (error)
117 return error;
118
119 } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) {
120 int error = gfs2_acl_validate_remove(ip, 0);
121 if (error)
122 return error;
123
124 } else
125 return -EPERM;
126
127 return gfs2_ea_remove_i(ip, er);
128}
129
130static const struct gfs2_eattr_operations gfs2_user_eaops = {
131 .eo_get = gfs2_ea_get_i,
132 .eo_set = gfs2_ea_set_i,
133 .eo_remove = gfs2_ea_remove_i,
134 .eo_name = "user",
135};
136
137const struct gfs2_eattr_operations gfs2_system_eaops = {
138 .eo_get = system_eo_get,
139 .eo_set = system_eo_set,
140 .eo_remove = system_eo_remove,
141 .eo_name = "system",
142};
143
144static const struct gfs2_eattr_operations gfs2_security_eaops = {
145 .eo_get = gfs2_ea_get_i,
146 .eo_set = gfs2_ea_set_i,
147 .eo_remove = gfs2_ea_remove_i,
148 .eo_name = "security",
149};
150
151const struct gfs2_eattr_operations *gfs2_ea_ops[] = {
152 NULL,
153 &gfs2_user_eaops,
154 &gfs2_system_eaops,
155 &gfs2_security_eaops,
156};
157
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h
deleted file mode 100644
index da2f7fbbb40d..000000000000
--- a/fs/gfs2/eaops.h
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __EAOPS_DOT_H__
11#define __EAOPS_DOT_H__
12
13struct gfs2_ea_request;
14struct gfs2_inode;
15
16struct gfs2_eattr_operations {
17 int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
18 int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
19 int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er);
20 char *eo_name;
21};
22
23unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name);
24
25extern const struct gfs2_eattr_operations gfs2_system_eaops;
26
27extern const struct gfs2_eattr_operations *gfs2_ea_ops[];
28
29#endif /* __EAOPS_DOT_H__ */
30
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9200ef221716..d15876e9aa26 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
143} 143}
144 144
145static struct dentry *gfs2_get_dentry(struct super_block *sb, 145static struct dentry *gfs2_get_dentry(struct super_block *sb,
146 struct gfs2_inum_host *inum) 146 struct gfs2_inum_host *inum)
147{ 147{
148 struct gfs2_sbd *sdp = sb->s_fs_info; 148 struct gfs2_sbd *sdp = sb->s_fs_info;
149 struct gfs2_holder i_gh, ri_gh, rgd_gh; 149 struct gfs2_holder i_gh;
150 struct gfs2_rgrpd *rgd;
151 struct inode *inode; 150 struct inode *inode;
152 struct dentry *dentry; 151 struct dentry *dentry;
153 int error; 152 int error;
154 153
155 /* System files? */
156
157 inode = gfs2_ilookup(sb, inum->no_addr); 154 inode = gfs2_ilookup(sb, inum->no_addr);
158 if (inode) { 155 if (inode) {
159 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { 156 if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) {
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
168 if (error) 165 if (error)
169 return ERR_PTR(error); 166 return ERR_PTR(error);
170 167
171 error = gfs2_rindex_hold(sdp, &ri_gh); 168 error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE);
172 if (error) 169 if (error)
173 goto fail; 170 goto fail;
174 171
175 error = -EINVAL; 172 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0);
176 rgd = gfs2_blk2rgrpd(sdp, inum->no_addr);
177 if (!rgd)
178 goto fail_rindex;
179
180 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
181 if (error)
182 goto fail_rindex;
183
184 error = -ESTALE;
185 if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE)
186 goto fail_rgd;
187
188 gfs2_glock_dq_uninit(&rgd_gh);
189 gfs2_glock_dq_uninit(&ri_gh);
190
191 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
192 inum->no_addr,
193 0, 0);
194 if (IS_ERR(inode)) { 173 if (IS_ERR(inode)) {
195 error = PTR_ERR(inode); 174 error = PTR_ERR(inode);
196 goto fail; 175 goto fail;
@@ -224,13 +203,6 @@ out_inode:
224 if (!IS_ERR(dentry)) 203 if (!IS_ERR(dentry))
225 dentry->d_op = &gfs2_dops; 204 dentry->d_op = &gfs2_dops;
226 return dentry; 205 return dentry;
227
228fail_rgd:
229 gfs2_glock_dq_uninit(&rgd_gh);
230
231fail_rindex:
232 gfs2_glock_dq_uninit(&ri_gh);
233
234fail: 206fail:
235 gfs2_glock_dq_uninit(&i_gh); 207 gfs2_glock_dq_uninit(&i_gh);
236 return ERR_PTR(error); 208 return ERR_PTR(error);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73318a3ce6f1..166f38fbd246 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -38,7 +38,6 @@
38#include "rgrp.h" 38#include "rgrp.h"
39#include "trans.h" 39#include "trans.h"
40#include "util.h" 40#include "util.h"
41#include "eaops.h"
42 41
43/** 42/**
44 * gfs2_llseek - seek to a location in a file 43 * gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 61801ada36f0..6edb423f90b3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -406,6 +406,12 @@ struct gfs2_statfs_change_host {
406#define GFS2_DATA_WRITEBACK 1 406#define GFS2_DATA_WRITEBACK 1
407#define GFS2_DATA_ORDERED 2 407#define GFS2_DATA_ORDERED 2
408 408
409#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW
410#define GFS2_ERRORS_WITHDRAW 0
411#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */
412#define GFS2_ERRORS_RO 2 /* place holder for future feature */
413#define GFS2_ERRORS_PANIC 3
414
409struct gfs2_args { 415struct gfs2_args {
410 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ 416 char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
411 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 417 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
@@ -422,6 +428,7 @@ struct gfs2_args {
422 unsigned int ar_data:2; /* ordered/writeback */ 428 unsigned int ar_data:2; /* ordered/writeback */
423 unsigned int ar_meta:1; /* mount metafs */ 429 unsigned int ar_meta:1; /* mount metafs */
424 unsigned int ar_discard:1; /* discard requests */ 430 unsigned int ar_discard:1; /* discard requests */
431 unsigned int ar_errors:2; /* errors=withdraw | panic */
425 int ar_commit; /* Commit interval */ 432 int ar_commit; /* Commit interval */
426}; 433};
427 434
@@ -489,7 +496,6 @@ struct gfs2_sb_host {
489 */ 496 */
490 497
491struct lm_lockstruct { 498struct lm_lockstruct {
492 u32 ls_id;
493 unsigned int ls_jid; 499 unsigned int ls_jid;
494 unsigned int ls_first; 500 unsigned int ls_first;
495 unsigned int ls_first_done; 501 unsigned int ls_first_done;
@@ -541,18 +547,12 @@ struct gfs2_sbd {
541 struct dentry *sd_root_dir; 547 struct dentry *sd_root_dir;
542 548
543 struct inode *sd_jindex; 549 struct inode *sd_jindex;
544 struct inode *sd_inum_inode;
545 struct inode *sd_statfs_inode; 550 struct inode *sd_statfs_inode;
546 struct inode *sd_ir_inode;
547 struct inode *sd_sc_inode; 551 struct inode *sd_sc_inode;
548 struct inode *sd_qc_inode; 552 struct inode *sd_qc_inode;
549 struct inode *sd_rindex; 553 struct inode *sd_rindex;
550 struct inode *sd_quota_inode; 554 struct inode *sd_quota_inode;
551 555
552 /* Inum stuff */
553
554 struct mutex sd_inum_mutex;
555
556 /* StatFS stuff */ 556 /* StatFS stuff */
557 557
558 spinlock_t sd_statfs_spin; 558 spinlock_t sd_statfs_spin;
@@ -580,7 +580,6 @@ struct gfs2_sbd {
580 struct gfs2_holder sd_journal_gh; 580 struct gfs2_holder sd_journal_gh;
581 struct gfs2_holder sd_jinode_gh; 581 struct gfs2_holder sd_jinode_gh;
582 582
583 struct gfs2_holder sd_ir_gh;
584 struct gfs2_holder sd_sc_gh; 583 struct gfs2_holder sd_sc_gh;
585 struct gfs2_holder sd_qc_gh; 584 struct gfs2_holder sd_qc_gh;
586 585
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2f94bd723698..fb15d3b1f409 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -24,7 +24,7 @@
24#include "acl.h" 24#include "acl.h"
25#include "bmap.h" 25#include "bmap.h"
26#include "dir.h" 26#include "dir.h"
27#include "eattr.h" 27#include "xattr.h"
28#include "glock.h" 28#include "glock.h"
29#include "glops.h" 29#include "glops.h"
30#include "inode.h" 30#include "inode.h"
@@ -519,139 +519,6 @@ out:
519 return inode ? inode : ERR_PTR(error); 519 return inode ? inode : ERR_PTR(error);
520} 520}
521 521
522static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf)
523{
524 const struct gfs2_inum_range *str = buf;
525
526 ir->ir_start = be64_to_cpu(str->ir_start);
527 ir->ir_length = be64_to_cpu(str->ir_length);
528}
529
530static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf)
531{
532 struct gfs2_inum_range *str = buf;
533
534 str->ir_start = cpu_to_be64(ir->ir_start);
535 str->ir_length = cpu_to_be64(ir->ir_length);
536}
537
538static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino)
539{
540 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
541 struct buffer_head *bh;
542 struct gfs2_inum_range_host ir;
543 int error;
544
545 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
546 if (error)
547 return error;
548 mutex_lock(&sdp->sd_inum_mutex);
549
550 error = gfs2_meta_inode_buffer(ip, &bh);
551 if (error) {
552 mutex_unlock(&sdp->sd_inum_mutex);
553 gfs2_trans_end(sdp);
554 return error;
555 }
556
557 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
558
559 if (ir.ir_length) {
560 *formal_ino = ir.ir_start++;
561 ir.ir_length--;
562 gfs2_trans_add_bh(ip->i_gl, bh, 1);
563 gfs2_inum_range_out(&ir,
564 bh->b_data + sizeof(struct gfs2_dinode));
565 brelse(bh);
566 mutex_unlock(&sdp->sd_inum_mutex);
567 gfs2_trans_end(sdp);
568 return 0;
569 }
570
571 brelse(bh);
572
573 mutex_unlock(&sdp->sd_inum_mutex);
574 gfs2_trans_end(sdp);
575
576 return 1;
577}
578
579static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino)
580{
581 struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode);
582 struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode);
583 struct gfs2_holder gh;
584 struct buffer_head *bh;
585 struct gfs2_inum_range_host ir;
586 int error;
587
588 error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
589 if (error)
590 return error;
591
592 error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0);
593 if (error)
594 goto out;
595 mutex_lock(&sdp->sd_inum_mutex);
596
597 error = gfs2_meta_inode_buffer(ip, &bh);
598 if (error)
599 goto out_end_trans;
600
601 gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode));
602
603 if (!ir.ir_length) {
604 struct buffer_head *m_bh;
605 u64 x, y;
606 __be64 z;
607
608 error = gfs2_meta_inode_buffer(m_ip, &m_bh);
609 if (error)
610 goto out_brelse;
611
612 z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode));
613 x = y = be64_to_cpu(z);
614 ir.ir_start = x;
615 ir.ir_length = GFS2_INUM_QUANTUM;
616 x += GFS2_INUM_QUANTUM;
617 if (x < y)
618 gfs2_consist_inode(m_ip);
619 z = cpu_to_be64(x);
620 gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1);
621 *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z;
622
623 brelse(m_bh);
624 }
625
626 *formal_ino = ir.ir_start++;
627 ir.ir_length--;
628
629 gfs2_trans_add_bh(ip->i_gl, bh, 1);
630 gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode));
631
632out_brelse:
633 brelse(bh);
634out_end_trans:
635 mutex_unlock(&sdp->sd_inum_mutex);
636 gfs2_trans_end(sdp);
637out:
638 gfs2_glock_dq_uninit(&gh);
639 return error;
640}
641
642static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum)
643{
644 int error;
645
646 error = pick_formal_ino_1(sdp, inum);
647 if (error <= 0)
648 return error;
649
650 error = pick_formal_ino_2(sdp, inum);
651
652 return error;
653}
654
655/** 522/**
656 * create_ok - OK to create a new on-disk inode here? 523 * create_ok - OK to create a new on-disk inode here?
657 * @dip: Directory in which dinode is to be created 524 * @dip: Directory in which dinode is to be created
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation)
731 if (error) 598 if (error)
732 goto out_ipreserv; 599 goto out_ipreserv;
733 600
734 *no_addr = gfs2_alloc_di(dip, generation); 601 error = gfs2_alloc_di(dip, no_addr, generation);
735 602
736 gfs2_trans_end(sdp); 603 gfs2_trans_end(sdp);
737 604
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
924 size_t len; 791 size_t len;
925 void *value; 792 void *value;
926 char *name; 793 char *name;
927 struct gfs2_ea_request er;
928 794
929 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 795 err = security_inode_init_security(&ip->i_inode, &dip->i_inode,
930 &name, &value, &len); 796 &name, &value, &len);
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
935 return err; 801 return err;
936 } 802 }
937 803
938 memset(&er, 0, sizeof(struct gfs2_ea_request)); 804 err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
939
940 er.er_type = GFS2_EATYPE_SECURITY;
941 er.er_name = name;
942 er.er_data = value;
943 er.er_name_len = strlen(name);
944 er.er_data_len = len;
945
946 err = gfs2_ea_set_i(ip, &er);
947
948 kfree(value); 805 kfree(value);
949 kfree(name); 806 kfree(name);
950 807
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
991 if (error) 848 if (error)
992 goto fail_gunlock; 849 goto fail_gunlock;
993 850
994 error = pick_formal_ino(sdp, &inum.no_formal_ino);
995 if (error)
996 goto fail_gunlock;
997
998 error = alloc_dinode(dip, &inum.no_addr, &generation); 851 error = alloc_dinode(dip, &inum.no_addr, &generation);
999 if (error) 852 if (error)
1000 goto fail_gunlock; 853 goto fail_gunlock;
854 inum.no_formal_ino = generation;
1001 855
1002 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, 856 error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops,
1003 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); 857 LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1);
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
1008 if (error) 862 if (error)
1009 goto fail_gunlock2; 863 goto fail_gunlock2;
1010 864
1011 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), 865 inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr,
1012 inum.no_addr, 866 inum.no_formal_ino, 0);
1013 inum.no_formal_ino, 0);
1014 if (IS_ERR(inode)) 867 if (IS_ERR(inode))
1015 goto fail_gunlock2; 868 goto fail_gunlock2;
1016 869
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7bc3c45cd676..52fb6c048981 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
84 84
85 gfs2_tune_init(&sdp->sd_tune); 85 gfs2_tune_init(&sdp->sd_tune);
86 86
87 mutex_init(&sdp->sd_inum_mutex);
88 spin_lock_init(&sdp->sd_statfs_spin); 87 spin_lock_init(&sdp->sd_statfs_spin);
89 88
90 spin_lock_init(&sdp->sd_rindex_spin); 89 spin_lock_init(&sdp->sd_rindex_spin);
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
833 if (error) 832 if (error)
834 goto fail; 833 goto fail;
835 834
836 /* Read in the master inode number inode */
837 sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
838 if (IS_ERR(sdp->sd_inum_inode)) {
839 error = PTR_ERR(sdp->sd_inum_inode);
840 fs_err(sdp, "can't read in inum inode: %d\n", error);
841 goto fail_journal;
842 }
843
844
845 /* Read in the master statfs inode */ 835 /* Read in the master statfs inode */
846 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); 836 sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
847 if (IS_ERR(sdp->sd_statfs_inode)) { 837 if (IS_ERR(sdp->sd_statfs_inode)) {
848 error = PTR_ERR(sdp->sd_statfs_inode); 838 error = PTR_ERR(sdp->sd_statfs_inode);
849 fs_err(sdp, "can't read in statfs inode: %d\n", error); 839 fs_err(sdp, "can't read in statfs inode: %d\n", error);
850 goto fail_inum; 840 goto fail_journal;
851 } 841 }
852 842
853 /* Read in the resource index inode */ 843 /* Read in the resource index inode */
@@ -876,8 +866,6 @@ fail_rindex:
876 iput(sdp->sd_rindex); 866 iput(sdp->sd_rindex);
877fail_statfs: 867fail_statfs:
878 iput(sdp->sd_statfs_inode); 868 iput(sdp->sd_statfs_inode);
879fail_inum:
880 iput(sdp->sd_inum_inode);
881fail_journal: 869fail_journal:
882 init_journal(sdp, UNDO); 870 init_journal(sdp, UNDO);
883fail: 871fail:
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
905 return error; 893 return error;
906 } 894 }
907 895
908 sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid);
909 sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf);
910 if (IS_ERR(sdp->sd_ir_inode)) {
911 error = PTR_ERR(sdp->sd_ir_inode);
912 fs_err(sdp, "can't find local \"ir\" file: %d\n", error);
913 goto fail;
914 }
915
916 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); 896 sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid);
917 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); 897 sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf);
918 if (IS_ERR(sdp->sd_sc_inode)) { 898 if (IS_ERR(sdp->sd_sc_inode)) {
919 error = PTR_ERR(sdp->sd_sc_inode); 899 error = PTR_ERR(sdp->sd_sc_inode);
920 fs_err(sdp, "can't find local \"sc\" file: %d\n", error); 900 fs_err(sdp, "can't find local \"sc\" file: %d\n", error);
921 goto fail_ir_i; 901 goto fail;
922 } 902 }
923 903
924 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); 904 sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid);
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
932 iput(pn); 912 iput(pn);
933 pn = NULL; 913 pn = NULL;
934 914
935 ip = GFS2_I(sdp->sd_ir_inode);
936 error = gfs2_glock_nq_init(ip->i_gl,
937 LM_ST_EXCLUSIVE, 0,
938 &sdp->sd_ir_gh);
939 if (error) {
940 fs_err(sdp, "can't lock local \"ir\" file: %d\n", error);
941 goto fail_qc_i;
942 }
943
944 ip = GFS2_I(sdp->sd_sc_inode); 915 ip = GFS2_I(sdp->sd_sc_inode);
945 error = gfs2_glock_nq_init(ip->i_gl, 916 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
946 LM_ST_EXCLUSIVE, 0,
947 &sdp->sd_sc_gh); 917 &sdp->sd_sc_gh);
948 if (error) { 918 if (error) {
949 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); 919 fs_err(sdp, "can't lock local \"sc\" file: %d\n", error);
950 goto fail_ir_gh; 920 goto fail_qc_i;
951 } 921 }
952 922
953 ip = GFS2_I(sdp->sd_qc_inode); 923 ip = GFS2_I(sdp->sd_qc_inode);
954 error = gfs2_glock_nq_init(ip->i_gl, 924 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0,
955 LM_ST_EXCLUSIVE, 0,
956 &sdp->sd_qc_gh); 925 &sdp->sd_qc_gh);
957 if (error) { 926 if (error) {
958 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); 927 fs_err(sdp, "can't lock local \"qc\" file: %d\n", error);
@@ -965,14 +934,10 @@ fail_qc_gh:
965 gfs2_glock_dq_uninit(&sdp->sd_qc_gh); 934 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
966fail_ut_gh: 935fail_ut_gh:
967 gfs2_glock_dq_uninit(&sdp->sd_sc_gh); 936 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
968fail_ir_gh:
969 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
970fail_qc_i: 937fail_qc_i:
971 iput(sdp->sd_qc_inode); 938 iput(sdp->sd_qc_inode);
972fail_ut_i: 939fail_ut_i:
973 iput(sdp->sd_sc_inode); 940 iput(sdp->sd_sc_inode);
974fail_ir_i:
975 iput(sdp->sd_ir_inode);
976fail: 941fail:
977 if (pn) 942 if (pn)
978 iput(pn); 943 iput(pn);
@@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1063 1028
1064 ls->ls_ops = lm; 1029 ls->ls_ops = lm;
1065 ls->ls_first = 1; 1030 ls->ls_first = 1;
1066 ls->ls_id = 0;
1067 1031
1068 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { 1032 for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
1069 substring_t tmp[MAX_OPT_ARGS]; 1033 substring_t tmp[MAX_OPT_ARGS];
@@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1081 ls->ls_jid = option; 1045 ls->ls_jid = option;
1082 break; 1046 break;
1083 case Opt_id: 1047 case Opt_id:
1084 ret = match_int(&tmp[0], &option); 1048 /* Obsolete, but left for backward compat purposes */
1085 if (ret)
1086 goto hostdata_error;
1087 ls->ls_id = option;
1088 break; 1049 break;
1089 case Opt_first: 1050 case Opt_first:
1090 ret = match_int(&tmp[0], &option); 1051 ret = match_int(&tmp[0], &option);
@@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1133 lm->lm_unmount(sdp); 1094 lm->lm_unmount(sdp);
1134} 1095}
1135 1096
1097void gfs2_online_uevent(struct gfs2_sbd *sdp)
1098{
1099 struct super_block *sb = sdp->sd_vfs;
1100 char ro[20];
1101 char spectator[20];
1102 char *envp[] = { ro, spectator, NULL };
1103 sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
1104 sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
1105 kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp);
1106}
1107
1136/** 1108/**
1137 * fill_super - Read in superblock 1109 * fill_super - Read in superblock
1138 * @sb: The VFS superblock 1110 * @sb: The VFS superblock
@@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1157 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; 1129 sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
1158 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; 1130 sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
1159 sdp->sd_args.ar_commit = 60; 1131 sdp->sd_args.ar_commit = 60;
1132 sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT;
1160 1133
1161 error = gfs2_mount_args(sdp, &sdp->sd_args, data); 1134 error = gfs2_mount_args(sdp, &sdp->sd_args, data);
1162 if (error) { 1135 if (error) {
@@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1174 sb->s_magic = GFS2_MAGIC; 1147 sb->s_magic = GFS2_MAGIC;
1175 sb->s_op = &gfs2_super_ops; 1148 sb->s_op = &gfs2_super_ops;
1176 sb->s_export_op = &gfs2_export_ops; 1149 sb->s_export_op = &gfs2_export_ops;
1150 sb->s_xattr = gfs2_xattr_handlers;
1177 sb->s_time_gran = 1; 1151 sb->s_time_gran = 1;
1178 sb->s_maxbytes = MAX_LFS_FILESIZE; 1152 sb->s_maxbytes = MAX_LFS_FILESIZE;
1179 1153
@@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
1236 } 1210 }
1237 1211
1238 gfs2_glock_dq_uninit(&mount_gh); 1212 gfs2_glock_dq_uninit(&mount_gh);
1239 1213 gfs2_online_uevent(sdp);
1240 return 0; 1214 return 0;
1241 1215
1242fail_threads: 1216fail_threads:
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f8bd20baf99c..c3ac18054057 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -26,8 +26,7 @@
26#include "acl.h" 26#include "acl.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "dir.h" 28#include "dir.h"
29#include "eaops.h" 29#include "xattr.h"
30#include "eattr.h"
31#include "glock.h" 30#include "glock.h"
32#include "inode.h" 31#include "inode.h"
33#include "meta_io.h" 32#include "meta_io.h"
@@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
349 348
350 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); 349 error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
351 if (error) 350 if (error)
352 goto out_rgrp; 351 goto out_gunlock;
353 352
354 error = gfs2_dir_del(dip, &dentry->d_name); 353 error = gfs2_dir_del(dip, &dentry->d_name);
355 if (error) 354 if (error)
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
1302 const void *data, size_t size, int flags) 1301 const void *data, size_t size, int flags)
1303{ 1302{
1304 struct inode *inode = dentry->d_inode; 1303 struct inode *inode = dentry->d_inode;
1305 struct gfs2_ea_request er; 1304 struct gfs2_inode *ip = GFS2_I(inode);
1306 1305 struct gfs2_holder gh;
1307 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1306 int ret;
1308 er.er_type = gfs2_ea_name2type(name, &er.er_name);
1309 if (er.er_type == GFS2_EATYPE_UNUSED)
1310 return -EOPNOTSUPP;
1311 er.er_data = (char *)data;
1312 er.er_name_len = strlen(er.er_name);
1313 er.er_data_len = size;
1314 er.er_flags = flags;
1315
1316 gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE));
1317 1307
1318 return gfs2_ea_set(GFS2_I(inode), &er); 1308 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1309 ret = gfs2_glock_nq(&gh);
1310 if (ret == 0) {
1311 ret = generic_setxattr(dentry, name, data, size, flags);
1312 gfs2_glock_dq(&gh);
1313 }
1314 gfs2_holder_uninit(&gh);
1315 return ret;
1319} 1316}
1320 1317
1321static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, 1318static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name,
1322 void *data, size_t size) 1319 void *data, size_t size)
1323{ 1320{
1324 struct gfs2_ea_request er; 1321 struct inode *inode = dentry->d_inode;
1325 1322 struct gfs2_inode *ip = GFS2_I(inode);
1326 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1323 struct gfs2_holder gh;
1327 er.er_type = gfs2_ea_name2type(name, &er.er_name); 1324 int ret;
1328 if (er.er_type == GFS2_EATYPE_UNUSED)
1329 return -EOPNOTSUPP;
1330 er.er_data = data;
1331 er.er_name_len = strlen(er.er_name);
1332 er.er_data_len = size;
1333
1334 return gfs2_ea_get(GFS2_I(dentry->d_inode), &er);
1335}
1336
1337static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
1338{
1339 struct gfs2_ea_request er;
1340
1341 memset(&er, 0, sizeof(struct gfs2_ea_request));
1342 er.er_data = (size) ? buffer : NULL;
1343 er.er_data_len = size;
1344 1325
1345 return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); 1326 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1327 ret = gfs2_glock_nq(&gh);
1328 if (ret == 0) {
1329 ret = generic_getxattr(dentry, name, data, size);
1330 gfs2_glock_dq(&gh);
1331 }
1332 gfs2_holder_uninit(&gh);
1333 return ret;
1346} 1334}
1347 1335
1348static int gfs2_removexattr(struct dentry *dentry, const char *name) 1336static int gfs2_removexattr(struct dentry *dentry, const char *name)
1349{ 1337{
1350 struct gfs2_ea_request er; 1338 struct inode *inode = dentry->d_inode;
1351 1339 struct gfs2_inode *ip = GFS2_I(inode);
1352 memset(&er, 0, sizeof(struct gfs2_ea_request)); 1340 struct gfs2_holder gh;
1353 er.er_type = gfs2_ea_name2type(name, &er.er_name); 1341 int ret;
1354 if (er.er_type == GFS2_EATYPE_UNUSED)
1355 return -EOPNOTSUPP;
1356 er.er_name_len = strlen(er.er_name);
1357 1342
1358 return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); 1343 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
1344 ret = gfs2_glock_nq(&gh);
1345 if (ret == 0) {
1346 ret = generic_removexattr(dentry, name);
1347 gfs2_glock_dq(&gh);
1348 }
1349 gfs2_holder_uninit(&gh);
1350 return ret;
1359} 1351}
1360 1352
1361static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1353static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fba795798d3a..28c590b7c9da 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -857,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
857 goto start_new_extent; 857 goto start_new_extent;
858 if ((start + nr_sects) != blk) { 858 if ((start + nr_sects) != blk) {
859 rv = blkdev_issue_discard(bdev, start, 859 rv = blkdev_issue_discard(bdev, start,
860 nr_sects, GFP_NOFS); 860 nr_sects, GFP_NOFS,
861 DISCARD_FL_BARRIER);
861 if (rv) 862 if (rv)
862 goto fail; 863 goto fail;
863 nr_sects = 0; 864 nr_sects = 0;
@@ -871,7 +872,8 @@ start_new_extent:
871 } 872 }
872 } 873 }
873 if (nr_sects) { 874 if (nr_sects) {
874 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); 875 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
876 DISCARD_FL_BARRIER);
875 if (rv) 877 if (rv)
876 goto fail; 878 goto fail;
877 } 879 }
@@ -1256,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1256 * Returns: The block type (GFS2_BLKST_*) 1258 * Returns: The block type (GFS2_BLKST_*)
1257 */ 1259 */
1258 1260
1259unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) 1261static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1260{ 1262{
1261 struct gfs2_bitmap *bi = NULL; 1263 struct gfs2_bitmap *bi = NULL;
1262 u32 length, rgrp_block, buf_block; 1264 u32 length, rgrp_block, buf_block;
@@ -1459,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
1459 return 0; 1461 return 0;
1460} 1462}
1461 1463
1464static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
1465{
1466 struct gfs2_sbd *sdp = rgd->rd_sbd;
1467 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
1468 (unsigned long long)rgd->rd_addr);
1469 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1470 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1471 rgd->rd_flags |= GFS2_RDF_ERROR;
1472}
1473
1462/** 1474/**
1463 * gfs2_alloc_block - Allocate one or more blocks 1475 * gfs2_alloc_block - Allocate one or more blocks
1464 * @ip: the inode to allocate the block for 1476 * @ip: the inode to allocate the block for
@@ -1520,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1520 return 0; 1532 return 0;
1521 1533
1522rgrp_error: 1534rgrp_error:
1523 fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", 1535 gfs2_rgrp_error(rgd);
1524 (unsigned long long)rgd->rd_addr);
1525 fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
1526 gfs2_rgrp_dump(NULL, rgd->rd_gl);
1527 rgd->rd_flags |= GFS2_RDF_ERROR;
1528 return -EIO; 1536 return -EIO;
1529} 1537}
1530 1538
1531/** 1539/**
1532 * gfs2_alloc_di - Allocate a dinode 1540 * gfs2_alloc_di - Allocate a dinode
1533 * @dip: the directory that the inode is going in 1541 * @dip: the directory that the inode is going in
1542 * @bn: the block number which is allocated
1543 * @generation: the generation number of the inode
1534 * 1544 *
1535 * Returns: the block allocated 1545 * Returns: 0 on success or error
1536 */ 1546 */
1537 1547
1538u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) 1548int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation)
1539{ 1549{
1540 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 1550 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
1541 struct gfs2_alloc *al = dip->i_alloc; 1551 struct gfs2_alloc *al = dip->i_alloc;
@@ -1546,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1546 1556
1547 blk = rgblk_search(rgd, rgd->rd_last_alloc, 1557 blk = rgblk_search(rgd, rgd->rd_last_alloc,
1548 GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); 1558 GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
1549 BUG_ON(blk == BFITNOENT);
1550 1559
1551 rgd->rd_last_alloc = blk; 1560 /* Since all blocks are reserved in advance, this shouldn't happen */
1561 if (blk == BFITNOENT)
1562 goto rgrp_error;
1552 1563
1564 rgd->rd_last_alloc = blk;
1553 block = rgd->rd_data0 + blk; 1565 block = rgd->rd_data0 + blk;
1566 if (rgd->rd_free == 0)
1567 goto rgrp_error;
1554 1568
1555 gfs2_assert_withdraw(sdp, rgd->rd_free);
1556 rgd->rd_free--; 1569 rgd->rd_free--;
1557 rgd->rd_dinodes++; 1570 rgd->rd_dinodes++;
1558 *generation = rgd->rd_igeneration++; 1571 *generation = rgd->rd_igeneration++;
1572 if (*generation == 0)
1573 *generation = rgd->rd_igeneration++;
1559 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1574 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1560 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1575 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1561 1576
@@ -1568,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1568 rgd->rd_free_clone--; 1583 rgd->rd_free_clone--;
1569 spin_unlock(&sdp->sd_rindex_spin); 1584 spin_unlock(&sdp->sd_rindex_spin);
1570 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); 1585 trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE);
1571 return block; 1586 *bn = block;
1587 return 0;
1588
1589rgrp_error:
1590 gfs2_rgrp_error(rgd);
1591 return -EIO;
1572} 1592}
1573 1593
1574/** 1594/**
@@ -1676,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1676} 1696}
1677 1697
1678/** 1698/**
1699 * gfs2_check_blk_type - Check the type of a block
1700 * @sdp: The superblock
1701 * @no_addr: The block number to check
1702 * @type: The block type we are looking for
1703 *
1704 * Returns: 0 if the block type matches the expected type
1705 * -ESTALE if it doesn't match
1706 * or -ve errno if something went wrong while checking
1707 */
1708
1709int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
1710{
1711 struct gfs2_rgrpd *rgd;
1712 struct gfs2_holder ri_gh, rgd_gh;
1713 int error;
1714
1715 error = gfs2_rindex_hold(sdp, &ri_gh);
1716 if (error)
1717 goto fail;
1718
1719 error = -EINVAL;
1720 rgd = gfs2_blk2rgrpd(sdp, no_addr);
1721 if (!rgd)
1722 goto fail_rindex;
1723
1724 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh);
1725 if (error)
1726 goto fail_rindex;
1727
1728 if (gfs2_get_block_type(rgd, no_addr) != type)
1729 error = -ESTALE;
1730
1731 gfs2_glock_dq_uninit(&rgd_gh);
1732fail_rindex:
1733 gfs2_glock_dq_uninit(&ri_gh);
1734fail:
1735 return error;
1736}
1737
1738/**
1679 * gfs2_rlist_add - add a RG to a list of RGs 1739 * gfs2_rlist_add - add a RG to a list of RGs
1680 * @sdp: the filesystem 1740 * @sdp: the filesystem
1681 * @rlist: the list of resource groups 1741 * @rlist: the list of resource groups
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 1e76ff0f3e00..b4106ddaaa98 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
44 44
45extern void gfs2_inplace_release(struct gfs2_inode *ip); 45extern void gfs2_inplace_release(struct gfs2_inode *ip);
46 46
47extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
48
49extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 47extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
50extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); 48extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
51 49
52extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 50extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
53extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 51extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
54extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 52extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
55extern void gfs2_unlink_di(struct inode *inode); 53extern void gfs2_unlink_di(struct inode *inode);
54extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
55 unsigned int type);
56 56
57struct gfs2_rgrp_list { 57struct gfs2_rgrp_list {
58 unsigned int rl_rgrps; 58 unsigned int rl_rgrps;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index f522bb017973..0ec3ec672de1 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -38,7 +38,7 @@
38#include "trans.h" 38#include "trans.h"
39#include "util.h" 39#include "util.h"
40#include "sys.h" 40#include "sys.h"
41#include "eattr.h" 41#include "xattr.h"
42 42
43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) 43#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
44 44
@@ -68,6 +68,8 @@ enum {
68 Opt_discard, 68 Opt_discard,
69 Opt_nodiscard, 69 Opt_nodiscard,
70 Opt_commit, 70 Opt_commit,
71 Opt_err_withdraw,
72 Opt_err_panic,
71 Opt_error, 73 Opt_error,
72}; 74};
73 75
@@ -97,6 +99,8 @@ static const match_table_t tokens = {
97 {Opt_discard, "discard"}, 99 {Opt_discard, "discard"},
98 {Opt_nodiscard, "nodiscard"}, 100 {Opt_nodiscard, "nodiscard"},
99 {Opt_commit, "commit=%d"}, 101 {Opt_commit, "commit=%d"},
102 {Opt_err_withdraw, "errors=withdraw"},
103 {Opt_err_panic, "errors=panic"},
100 {Opt_error, NULL} 104 {Opt_error, NULL}
101}; 105};
102 106
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
152 args->ar_localcaching = 1; 156 args->ar_localcaching = 1;
153 break; 157 break;
154 case Opt_debug: 158 case Opt_debug:
159 if (args->ar_errors == GFS2_ERRORS_PANIC) {
160 fs_info(sdp, "-o debug and -o errors=panic "
161 "are mutually exclusive.\n");
162 return -EINVAL;
163 }
155 args->ar_debug = 1; 164 args->ar_debug = 1;
156 break; 165 break;
157 case Opt_nodebug: 166 case Opt_nodebug:
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
205 return rv ? rv : -EINVAL; 214 return rv ? rv : -EINVAL;
206 } 215 }
207 break; 216 break;
217 case Opt_err_withdraw:
218 args->ar_errors = GFS2_ERRORS_WITHDRAW;
219 break;
220 case Opt_err_panic:
221 if (args->ar_debug) {
222 fs_info(sdp, "-o debug and -o errors=panic "
223 "are mutually exclusive.\n");
224 return -EINVAL;
225 }
226 args->ar_errors = GFS2_ERRORS_PANIC;
227 break;
208 case Opt_error: 228 case Opt_error:
209 default: 229 default:
210 fs_info(sdp, "invalid mount option: %s\n", o); 230 fs_info(sdp, "invalid mount option: %s\n", o);
@@ -768,7 +788,6 @@ restart:
768 /* Release stuff */ 788 /* Release stuff */
769 789
770 iput(sdp->sd_jindex); 790 iput(sdp->sd_jindex);
771 iput(sdp->sd_inum_inode);
772 iput(sdp->sd_statfs_inode); 791 iput(sdp->sd_statfs_inode);
773 iput(sdp->sd_rindex); 792 iput(sdp->sd_rindex);
774 iput(sdp->sd_quota_inode); 793 iput(sdp->sd_quota_inode);
@@ -779,10 +798,8 @@ restart:
779 if (!sdp->sd_args.ar_spectator) { 798 if (!sdp->sd_args.ar_spectator) {
780 gfs2_glock_dq_uninit(&sdp->sd_journal_gh); 799 gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
781 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); 800 gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
782 gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
783 gfs2_glock_dq_uninit(&sdp->sd_sc_gh); 801 gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
784 gfs2_glock_dq_uninit(&sdp->sd_qc_gh); 802 gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
785 iput(sdp->sd_ir_inode);
786 iput(sdp->sd_sc_inode); 803 iput(sdp->sd_sc_inode);
787 iput(sdp->sd_qc_inode); 804 iput(sdp->sd_qc_inode);
788 } 805 }
@@ -1084,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1084 gt->gt_log_flush_secs = args.ar_commit; 1101 gt->gt_log_flush_secs = args.ar_commit;
1085 spin_unlock(&gt->gt_spin); 1102 spin_unlock(&gt->gt_spin);
1086 1103
1104 gfs2_online_uevent(sdp);
1087 return 0; 1105 return 0;
1088} 1106}
1089 1107
@@ -1225,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1225 lfsecs = sdp->sd_tune.gt_log_flush_secs; 1243 lfsecs = sdp->sd_tune.gt_log_flush_secs;
1226 if (lfsecs != 60) 1244 if (lfsecs != 60)
1227 seq_printf(s, ",commit=%d", lfsecs); 1245 seq_printf(s, ",commit=%d", lfsecs);
1246 if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
1247 const char *state;
1248
1249 switch (args->ar_errors) {
1250 case GFS2_ERRORS_WITHDRAW:
1251 state = "withdraw";
1252 break;
1253 case GFS2_ERRORS_PANIC:
1254 state = "panic";
1255 break;
1256 default:
1257 state = "unknown";
1258 break;
1259 }
1260 seq_printf(s, ",errors=%s", state);
1261 }
1228 return 0; 1262 return 0;
1229} 1263}
1230 1264
@@ -1252,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode)
1252 goto out; 1286 goto out;
1253 } 1287 }
1254 1288
1289 error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED);
1290 if (error)
1291 goto out_truncate;
1292
1255 gfs2_glock_dq_wait(&ip->i_iopen_gh); 1293 gfs2_glock_dq_wait(&ip->i_iopen_gh);
1256 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); 1294 gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
1257 error = gfs2_glock_nq(&ip->i_iopen_gh); 1295 error = gfs2_glock_nq(&ip->i_iopen_gh);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 22e0417ed996..235db3682885 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
25 return x; 25 return x;
26} 26}
27 27
28void gfs2_jindex_free(struct gfs2_sbd *sdp); 28extern void gfs2_jindex_free(struct gfs2_sbd *sdp);
29 29
30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); 30extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
31 31
@@ -36,7 +36,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
36 struct gfs2_inode **ipp); 36 struct gfs2_inode **ipp);
37 37
38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); 38extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
39 39extern void gfs2_online_uevent(struct gfs2_sbd *sdp);
40extern int gfs2_statfs_init(struct gfs2_sbd *sdp); 40extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, 41extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
42 s64 dinodes); 42 s64 dinodes);
@@ -54,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[];
57 58
58#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
59 60
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index a7cbfbd340c7..446329728d52 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/genhd.h>
19 20
20#include "gfs2.h" 21#include "gfs2.h"
21#include "incore.h" 22#include "incore.h"
@@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
319 return ret; 320 return ret;
320} 321}
321 322
322static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
323{
324 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
325 return sprintf(buf, "%u\n", ls->ls_id);
326}
327
328static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) 323static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
329{ 324{
330 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 325 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -389,7 +384,6 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
389GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); 384GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
390GDLM_ATTR(block, 0644, block_show, block_store); 385GDLM_ATTR(block, 0644, block_show, block_store);
391GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); 386GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
392GDLM_ATTR(id, 0444, lkid_show, NULL);
393GDLM_ATTR(jid, 0444, jid_show, NULL); 387GDLM_ATTR(jid, 0444, jid_show, NULL);
394GDLM_ATTR(first, 0444, lkfirst_show, NULL); 388GDLM_ATTR(first, 0444, lkfirst_show, NULL);
395GDLM_ATTR(first_done, 0444, first_done_show, NULL); 389GDLM_ATTR(first_done, 0444, first_done_show, NULL);
@@ -401,7 +395,6 @@ static struct attribute *lock_module_attrs[] = {
401 &gdlm_attr_proto_name.attr, 395 &gdlm_attr_proto_name.attr,
402 &gdlm_attr_block.attr, 396 &gdlm_attr_block.attr,
403 &gdlm_attr_withdraw.attr, 397 &gdlm_attr_withdraw.attr,
404 &gdlm_attr_id.attr,
405 &gdlm_attr_jid.attr, 398 &gdlm_attr_jid.attr,
406 &gdlm_attr_first.attr, 399 &gdlm_attr_first.attr,
407 &gdlm_attr_first_done.attr, 400 &gdlm_attr_first_done.attr,
@@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = {
519 512
520int gfs2_sys_fs_add(struct gfs2_sbd *sdp) 513int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
521{ 514{
515 struct super_block *sb = sdp->sd_vfs;
522 int error; 516 int error;
517 char ro[20];
518 char spectator[20];
519 char *envp[] = { ro, spectator, NULL };
520
521 sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0);
522 sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0);
523 523
524 sdp->sd_kobj.kset = gfs2_kset; 524 sdp->sd_kobj.kset = gfs2_kset;
525 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, 525 error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL,
@@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
535 if (error) 535 if (error)
536 goto fail_tune; 536 goto fail_tune;
537 537
538 kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); 538 error = sysfs_create_link(&sdp->sd_kobj,
539 &disk_to_dev(sb->s_bdev->bd_disk)->kobj,
540 "device");
541 if (error)
542 goto fail_lock_module;
543
544 kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp);
539 return 0; 545 return 0;
540 546
547fail_lock_module:
548 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
541fail_tune: 549fail_tune:
542 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 550 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
543fail_reg: 551fail_reg:
@@ -549,12 +557,12 @@ fail:
549 557
550void gfs2_sys_fs_del(struct gfs2_sbd *sdp) 558void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
551{ 559{
560 sysfs_remove_link(&sdp->sd_kobj, "device");
552 sysfs_remove_group(&sdp->sd_kobj, &tune_group); 561 sysfs_remove_group(&sdp->sd_kobj, &tune_group);
553 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); 562 sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
554 kobject_put(&sdp->sd_kobj); 563 kobject_put(&sdp->sd_kobj);
555} 564}
556 565
557
558static int gfs2_uevent(struct kset *kset, struct kobject *kobj, 566static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
559 struct kobj_uevent_env *env) 567 struct kobj_uevent_env *env)
560{ 568{
@@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
563 571
564 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 572 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
565 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 573 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
574 if (!sdp->sd_args.ar_spectator)
575 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid);
566 if (gfs2_uuid_valid(uuid)) { 576 if (gfs2_uuid_valid(uuid)) {
567 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" 577 add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
568 "%02X%02X-%02X%02X%02X%02X%02X%02X", 578 "%02X%02X-%02X%02X%02X%02X%02X%02X",
@@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = {
578 .uevent = gfs2_uevent, 588 .uevent = gfs2_uevent,
579}; 589};
580 590
581
582int gfs2_sys_init(void) 591int gfs2_sys_init(void)
583{ 592{
584 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); 593 gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 9d12b1118ba0..f6a7efa34eb9 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
38 const struct lm_lockops *lm = ls->ls_ops; 38 const struct lm_lockops *lm = ls->ls_ops;
39 va_list args; 39 va_list args;
40 40
41 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) 41 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
42 test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
42 return 0; 43 return 0;
43 44
44 va_start(args, fmt); 45 va_start(args, fmt);
45 vprintk(fmt, args); 46 vprintk(fmt, args);
46 va_end(args); 47 va_end(args);
47 48
48 fs_err(sdp, "about to withdraw this file system\n"); 49 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
49 BUG_ON(sdp->sd_args.ar_debug); 50 fs_err(sdp, "about to withdraw this file system\n");
51 BUG_ON(sdp->sd_args.ar_debug);
50 52
51 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); 53 kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
52 54
53 if (lm->lm_unmount) { 55 if (lm->lm_unmount) {
54 fs_err(sdp, "telling LM to unmount\n"); 56 fs_err(sdp, "telling LM to unmount\n");
55 lm->lm_unmount(sdp); 57 lm->lm_unmount(sdp);
58 }
59 fs_err(sdp, "withdrawn\n");
60 dump_stack();
56 } 61 }
57 fs_err(sdp, "withdrawn\n"); 62
58 dump_stack(); 63 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
64 panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
59 65
60 return -1; 66 return -1;
61} 67}
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
93 gfs2_tune_get(sdp, gt_complain_secs) * HZ)) 99 gfs2_tune_get(sdp, gt_complain_secs) * HZ))
94 return -2; 100 return -2;
95 101
96 printk(KERN_WARNING 102 if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
97 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" 103 printk(KERN_WARNING
98 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", 104 "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
99 sdp->sd_fsname, assertion, 105 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
100 sdp->sd_fsname, function, file, line); 106 sdp->sd_fsname, assertion,
107 sdp->sd_fsname, function, file, line);
101 108
102 if (sdp->sd_args.ar_debug) 109 if (sdp->sd_args.ar_debug)
103 BUG(); 110 BUG();
104 else 111 else
105 dump_stack(); 112 dump_stack();
106 113
114 if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
115 panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
116 "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
117 sdp->sd_fsname, assertion,
118 sdp->sd_fsname, function, file, line);
119
107 sdp->sd_last_warning = jiffies; 120 sdp->sd_last_warning = jiffies;
108 121
109 return -1; 122 return -1;
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c
index 07ea9529adda..8a0f8ef6ee27 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/xattr.c
@@ -18,8 +18,7 @@
18#include "gfs2.h" 18#include "gfs2.h"
19#include "incore.h" 19#include "incore.h"
20#include "acl.h" 20#include "acl.h"
21#include "eaops.h" 21#include "xattr.h"
22#include "eattr.h"
23#include "glock.h" 22#include "glock.h"
24#include "inode.h" 23#include "inode.h"
25#include "meta_io.h" 24#include "meta_io.h"
@@ -38,26 +37,32 @@
38 * Returns: 1 if the EA should be stuffed 37 * Returns: 1 if the EA should be stuffed
39 */ 38 */
40 39
41static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, 40static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
42 unsigned int *size) 41 unsigned int *size)
43{ 42{
44 *size = GFS2_EAREQ_SIZE_STUFFED(er); 43 unsigned int jbsize = sdp->sd_jbsize;
45 if (*size <= sdp->sd_jbsize) 44
45 /* Stuffed */
46 *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
47
48 if (*size <= jbsize)
46 return 1; 49 return 1;
47 50
48 *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); 51 /* Unstuffed */
52 *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
53 (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
49 54
50 return 0; 55 return 0;
51} 56}
52 57
53static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) 58static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
54{ 59{
55 unsigned int size; 60 unsigned int size;
56 61
57 if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) 62 if (dsize > GFS2_EA_MAX_DATA_LEN)
58 return -ERANGE; 63 return -ERANGE;
59 64
60 ea_calc_size(sdp, er, &size); 65 ea_calc_size(sdp, nsize, dsize, &size);
61 66
62 /* This can only happen with 512 byte blocks */ 67 /* This can only happen with 512 byte blocks */
63 if (size > sdp->sd_jbsize) 68 if (size > sdp->sd_jbsize)
@@ -151,7 +156,9 @@ out:
151} 156}
152 157
153struct ea_find { 158struct ea_find {
154 struct gfs2_ea_request *ef_er; 159 int type;
160 const char *name;
161 size_t namel;
155 struct gfs2_ea_location *ef_el; 162 struct gfs2_ea_location *ef_el;
156}; 163};
157 164
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
160 void *private) 167 void *private)
161{ 168{
162 struct ea_find *ef = private; 169 struct ea_find *ef = private;
163 struct gfs2_ea_request *er = ef->ef_er;
164 170
165 if (ea->ea_type == GFS2_EATYPE_UNUSED) 171 if (ea->ea_type == GFS2_EATYPE_UNUSED)
166 return 0; 172 return 0;
167 173
168 if (ea->ea_type == er->er_type) { 174 if (ea->ea_type == ef->type) {
169 if (ea->ea_name_len == er->er_name_len && 175 if (ea->ea_name_len == ef->namel &&
170 !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { 176 !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
171 struct gfs2_ea_location *el = ef->ef_el; 177 struct gfs2_ea_location *el = ef->ef_el;
172 get_bh(bh); 178 get_bh(bh);
173 el->el_bh = bh; 179 el->el_bh = bh;
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
180 return 0; 186 return 0;
181} 187}
182 188
183int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, 189int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
184 struct gfs2_ea_location *el) 190 struct gfs2_ea_location *el)
185{ 191{
186 struct ea_find ef; 192 struct ea_find ef;
187 int error; 193 int error;
188 194
189 ef.ef_er = er; 195 ef.type = type;
196 ef.name = name;
197 ef.namel = strlen(name);
190 ef.ef_el = el; 198 ef.ef_el = el;
191 199
192 memset(el, 0, sizeof(struct gfs2_ea_location)); 200 memset(el, 0, sizeof(struct gfs2_ea_location));
@@ -344,6 +352,20 @@ struct ea_list {
344 unsigned int ei_size; 352 unsigned int ei_size;
345}; 353};
346 354
355static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
356{
357 switch (ea->ea_type) {
358 case GFS2_EATYPE_USR:
359 return 5 + ea->ea_name_len + 1;
360 case GFS2_EATYPE_SYS:
361 return 7 + ea->ea_name_len + 1;
362 case GFS2_EATYPE_SECURITY:
363 return 9 + ea->ea_name_len + 1;
364 default:
365 return 0;
366 }
367}
368
347static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, 369static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
348 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, 370 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
349 void *private) 371 void *private)
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
392} 414}
393 415
394/** 416/**
395 * gfs2_ea_list - 417 * gfs2_listxattr - List gfs2 extended attributes
396 * @ip: 418 * @dentry: The dentry whose inode we are interested in
397 * @er: 419 * @buffer: The buffer to write the results
420 * @size: The size of the buffer
398 * 421 *
399 * Returns: actual size of data on success, -errno on error 422 * Returns: actual size of data on success, -errno on error
400 */ 423 */
401 424
402int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) 425ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
403{ 426{
427 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
428 struct gfs2_ea_request er;
404 struct gfs2_holder i_gh; 429 struct gfs2_holder i_gh;
405 int error; 430 int error;
406 431
407 if (!er->er_data || !er->er_data_len) { 432 memset(&er, 0, sizeof(struct gfs2_ea_request));
408 er->er_data = NULL; 433 if (size) {
409 er->er_data_len = 0; 434 er.er_data = buffer;
435 er.er_data_len = size;
410 } 436 }
411 437
412 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 438 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er)
414 return error; 440 return error;
415 441
416 if (ip->i_eattr) { 442 if (ip->i_eattr) {
417 struct ea_list ei = { .ei_er = er, .ei_size = 0 }; 443 struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
418 444
419 error = ea_foreach(ip, ea_list_i, &ei); 445 error = ea_foreach(ip, ea_list_i, &ei);
420 if (!error) 446 if (!error)
@@ -491,84 +517,61 @@ out:
491} 517}
492 518
493int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, 519int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
494 char *data) 520 char *data, size_t size)
495{ 521{
522 int ret;
523 size_t len = GFS2_EA_DATA_LEN(el->el_ea);
524 if (len > size)
525 return -ERANGE;
526
496 if (GFS2_EA_IS_STUFFED(el->el_ea)) { 527 if (GFS2_EA_IS_STUFFED(el->el_ea)) {
497 memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); 528 memcpy(data, GFS2_EA2DATA(el->el_ea), len);
498 return 0; 529 return len;
499 } else 530 }
500 return ea_get_unstuffed(ip, el->el_ea, data); 531 ret = ea_get_unstuffed(ip, el->el_ea, data);
532 if (ret < 0)
533 return ret;
534 return len;
501} 535}
502 536
503/** 537/**
504 * gfs2_ea_get_i - 538 * gfs2_xattr_get - Get a GFS2 extended attribute
505 * @ip: The GFS2 inode 539 * @inode: The inode
506 * @er: The request structure 540 * @type: The type of extended attribute
541 * @name: The name of the extended attribute
542 * @buffer: The buffer to write the result into
543 * @size: The size of the buffer
507 * 544 *
508 * Returns: actual size of data on success, -errno on error 545 * Returns: actual size of data on success, -errno on error
509 */ 546 */
510 547
511int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) 548int gfs2_xattr_get(struct inode *inode, int type, const char *name,
549 void *buffer, size_t size)
512{ 550{
551 struct gfs2_inode *ip = GFS2_I(inode);
513 struct gfs2_ea_location el; 552 struct gfs2_ea_location el;
514 int error; 553 int error;
515 554
516 if (!ip->i_eattr) 555 if (!ip->i_eattr)
517 return -ENODATA; 556 return -ENODATA;
557 if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
558 return -EINVAL;
518 559
519 error = gfs2_ea_find(ip, er, &el); 560 error = gfs2_ea_find(ip, type, name, &el);
520 if (error) 561 if (error)
521 return error; 562 return error;
522 if (!el.el_ea) 563 if (!el.el_ea)
523 return -ENODATA; 564 return -ENODATA;
524 565 if (size)
525 if (er->er_data_len) { 566 error = gfs2_ea_get_copy(ip, &el, buffer, size);
526 if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) 567 else
527 error = -ERANGE;
528 else
529 error = gfs2_ea_get_copy(ip, &el, er->er_data);
530 }
531 if (!error)
532 error = GFS2_EA_DATA_LEN(el.el_ea); 568 error = GFS2_EA_DATA_LEN(el.el_ea);
533
534 brelse(el.el_bh); 569 brelse(el.el_bh);
535 570
536 return error; 571 return error;
537} 572}
538 573
539/** 574/**
540 * gfs2_ea_get -
541 * @ip: The GFS2 inode
542 * @er: The request structure
543 *
544 * Returns: actual size of data on success, -errno on error
545 */
546
547int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er)
548{
549 struct gfs2_holder i_gh;
550 int error;
551
552 if (!er->er_name_len ||
553 er->er_name_len > GFS2_EA_MAX_NAME_LEN)
554 return -EINVAL;
555 if (!er->er_data || !er->er_data_len) {
556 er->er_data = NULL;
557 er->er_data_len = 0;
558 }
559
560 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
561 if (error)
562 return error;
563
564 error = gfs2_ea_ops[er->er_type]->eo_get(ip, er);
565
566 gfs2_glock_dq_uninit(&i_gh);
567
568 return error;
569}
570
571/**
572 * ea_alloc_blk - allocates a new block for extended attributes. 575 * ea_alloc_blk - allocates a new block for extended attributes.
573 * @ip: A pointer to the inode that's getting extended attributes 576 * @ip: A pointer to the inode that's getting extended attributes
574 * @bhp: Pointer to pointer to a struct buffer_head 577 * @bhp: Pointer to pointer to a struct buffer_head
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
713 716
714 error = gfs2_meta_inode_buffer(ip, &dibh); 717 error = gfs2_meta_inode_buffer(ip, &dibh);
715 if (!error) { 718 if (!error) {
716 if (er->er_flags & GFS2_ERF_MODE) {
717 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
718 (ip->i_inode.i_mode & S_IFMT) ==
719 (er->er_mode & S_IFMT));
720 ip->i_inode.i_mode = er->er_mode;
721 }
722 ip->i_inode.i_ctime = CURRENT_TIME; 719 ip->i_inode.i_ctime = CURRENT_TIME;
723 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 720 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
724 gfs2_dinode_out(ip, dibh->b_data); 721 gfs2_dinode_out(ip, dibh->b_data);
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
762 * Returns: errno 759 * Returns: errno
763 */ 760 */
764 761
765static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) 762static int ea_init(struct gfs2_inode *ip, int type, const char *name,
763 const void *data, size_t size)
766{ 764{
765 struct gfs2_ea_request er;
767 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; 766 unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
768 unsigned int blks = 1; 767 unsigned int blks = 1;
769 768
770 if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) 769 er.er_type = type;
771 blks += DIV_ROUND_UP(er->er_data_len, jbsize); 770 er.er_name = name;
771 er.er_name_len = strlen(name);
772 er.er_data = (void *)data;
773 er.er_data_len = size;
774
775 if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
776 blks += DIV_ROUND_UP(er.er_data_len, jbsize);
772 777
773 return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); 778 return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
774} 779}
775 780
776static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) 781static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
848 error = gfs2_meta_inode_buffer(ip, &dibh); 853 error = gfs2_meta_inode_buffer(ip, &dibh);
849 if (error) 854 if (error)
850 goto out; 855 goto out;
851
852 if (er->er_flags & GFS2_ERF_MODE) {
853 gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
854 (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT));
855 ip->i_inode.i_mode = er->er_mode;
856 }
857 ip->i_inode.i_ctime = CURRENT_TIME; 856 ip->i_inode.i_ctime = CURRENT_TIME;
858 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 857 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
859 gfs2_dinode_out(ip, dibh->b_data); 858 gfs2_dinode_out(ip, dibh->b_data);
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
894 int stuffed; 893 int stuffed;
895 int error; 894 int error;
896 895
897 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); 896 stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
897 es->es_er->er_data_len, &size);
898 898
899 if (ea->ea_type == GFS2_EATYPE_UNUSED) { 899 if (ea->ea_type == GFS2_EATYPE_UNUSED) {
900 if (GFS2_EA_REC_LEN(ea) < size) 900 if (GFS2_EA_REC_LEN(ea) < size)
@@ -1005,15 +1005,22 @@ out:
1005 return error; 1005 return error;
1006} 1006}
1007 1007
1008static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, 1008static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
1009 struct gfs2_ea_location *el) 1009 const void *value, size_t size, struct gfs2_ea_location *el)
1010{ 1010{
1011 struct gfs2_ea_request er;
1011 struct ea_set es; 1012 struct ea_set es;
1012 unsigned int blks = 2; 1013 unsigned int blks = 2;
1013 int error; 1014 int error;
1014 1015
1016 er.er_type = type;
1017 er.er_name = name;
1018 er.er_data = (void *)value;
1019 er.er_name_len = strlen(name);
1020 er.er_data_len = size;
1021
1015 memset(&es, 0, sizeof(struct ea_set)); 1022 memset(&es, 0, sizeof(struct ea_set));
1016 es.es_er = er; 1023 es.es_er = &er;
1017 es.es_el = el; 1024 es.es_el = el;
1018 1025
1019 error = ea_foreach(ip, ea_set_simple, &es); 1026 error = ea_foreach(ip, ea_set_simple, &es);
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
1024 1031
1025 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) 1032 if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
1026 blks++; 1033 blks++;
1027 if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) 1034 if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
1028 blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); 1035 blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
1029 1036
1030 return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); 1037 return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
1031} 1038}
1032 1039
1033static int ea_set_remove_unstuffed(struct gfs2_inode *ip, 1040static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
@@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
1039 GFS2_EA2NEXT(el->el_prev) == el->el_ea); 1046 GFS2_EA2NEXT(el->el_prev) == el->el_ea);
1040 } 1047 }
1041 1048
1042 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); 1049 return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
1043}
1044
1045int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1046{
1047 struct gfs2_ea_location el;
1048 int error;
1049
1050 if (!ip->i_eattr) {
1051 if (er->er_flags & XATTR_REPLACE)
1052 return -ENODATA;
1053 return ea_init(ip, er);
1054 }
1055
1056 error = gfs2_ea_find(ip, er, &el);
1057 if (error)
1058 return error;
1059
1060 if (el.el_ea) {
1061 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1062 brelse(el.el_bh);
1063 return -EPERM;
1064 }
1065
1066 error = -EEXIST;
1067 if (!(er->er_flags & XATTR_CREATE)) {
1068 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1069 error = ea_set_i(ip, er, &el);
1070 if (!error && unstuffed)
1071 ea_set_remove_unstuffed(ip, &el);
1072 }
1073
1074 brelse(el.el_bh);
1075 } else {
1076 error = -ENODATA;
1077 if (!(er->er_flags & XATTR_REPLACE))
1078 error = ea_set_i(ip, er, NULL);
1079 }
1080
1081 return error;
1082}
1083
1084int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1085{
1086 struct gfs2_holder i_gh;
1087 int error;
1088
1089 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN)
1090 return -EINVAL;
1091 if (!er->er_data || !er->er_data_len) {
1092 er->er_data = NULL;
1093 er->er_data_len = 0;
1094 }
1095 error = ea_check_size(GFS2_SB(&ip->i_inode), er);
1096 if (error)
1097 return error;
1098
1099 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
1100 if (error)
1101 return error;
1102
1103 if (IS_IMMUTABLE(&ip->i_inode))
1104 error = -EPERM;
1105 else
1106 error = gfs2_ea_ops[er->er_type]->eo_set(ip, er);
1107
1108 gfs2_glock_dq_uninit(&i_gh);
1109
1110 return error;
1111} 1050}
1112 1051
1113static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) 1052static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1131 1070
1132 if (GFS2_EA_IS_LAST(ea)) 1071 if (GFS2_EA_IS_LAST(ea))
1133 prev->ea_flags |= GFS2_EAFLAG_LAST; 1072 prev->ea_flags |= GFS2_EAFLAG_LAST;
1134 } else 1073 } else {
1135 ea->ea_type = GFS2_EATYPE_UNUSED; 1074 ea->ea_type = GFS2_EATYPE_UNUSED;
1075 }
1136 1076
1137 error = gfs2_meta_inode_buffer(ip, &dibh); 1077 error = gfs2_meta_inode_buffer(ip, &dibh);
1138 if (!error) { 1078 if (!error) {
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
1147 return error; 1087 return error;
1148} 1088}
1149 1089
1150int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) 1090/**
1091 * gfs2_xattr_remove - Remove a GFS2 extended attribute
1092 * @inode: The inode
1093 * @type: The type of the extended attribute
1094 * @name: The name of the extended attribute
1095 *
1096 * This is not called directly by the VFS since we use the (common)
1097 * scheme of making a "set with NULL data" mean a remove request. Note
1098 * that this is different from a set with zero length data.
1099 *
1100 * Returns: 0, or errno on failure
1101 */
1102
1103static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
1151{ 1104{
1105 struct gfs2_inode *ip = GFS2_I(inode);
1152 struct gfs2_ea_location el; 1106 struct gfs2_ea_location el;
1153 int error; 1107 int error;
1154 1108
1155 if (!ip->i_eattr) 1109 if (!ip->i_eattr)
1156 return -ENODATA; 1110 return -ENODATA;
1157 1111
1158 error = gfs2_ea_find(ip, er, &el); 1112 error = gfs2_ea_find(ip, type, name, &el);
1159 if (error) 1113 if (error)
1160 return error; 1114 return error;
1161 if (!el.el_ea) 1115 if (!el.el_ea)
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1164 if (GFS2_EA_IS_STUFFED(el.el_ea)) 1118 if (GFS2_EA_IS_STUFFED(el.el_ea))
1165 error = ea_remove_stuffed(ip, &el); 1119 error = ea_remove_stuffed(ip, &el);
1166 else 1120 else
1167 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 1121 error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
1168 0);
1169 1122
1170 brelse(el.el_bh); 1123 brelse(el.el_bh);
1171 1124
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er)
1173} 1126}
1174 1127
1175/** 1128/**
1176 * gfs2_ea_remove - sets (or creates or replaces) an extended attribute 1129 * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
1177 * @ip: pointer to the inode of the target file 1130 * @inode: The inode
1178 * @er: request information 1131 * @type: The type of the extended attribute
1132 * @name: The name of the extended attribute
1133 * @value: The value of the extended attribute (NULL for remove)
1134 * @size: The size of the @value argument
1135 * @flags: Create or Replace
1179 * 1136 *
1180 * Returns: errno 1137 * See gfs2_xattr_remove() for details of the removal of xattrs.
1138 *
1139 * Returns: 0 or errno on failure
1181 */ 1140 */
1182 1141
1183int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) 1142int gfs2_xattr_set(struct inode *inode, int type, const char *name,
1143 const void *value, size_t size, int flags)
1184{ 1144{
1185 struct gfs2_holder i_gh; 1145 struct gfs2_sbd *sdp = GFS2_SB(inode);
1146 struct gfs2_inode *ip = GFS2_I(inode);
1147 struct gfs2_ea_location el;
1148 unsigned int namel = strlen(name);
1186 int error; 1149 int error;
1187 1150
1188 if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) 1151 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
1189 return -EINVAL; 1152 return -EPERM;
1153 if (namel > GFS2_EA_MAX_NAME_LEN)
1154 return -ERANGE;
1190 1155
1191 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 1156 if (value == NULL)
1157 return gfs2_xattr_remove(inode, type, name);
1158
1159 if (ea_check_size(sdp, namel, size))
1160 return -ERANGE;
1161
1162 if (!ip->i_eattr) {
1163 if (flags & XATTR_REPLACE)
1164 return -ENODATA;
1165 return ea_init(ip, type, name, value, size);
1166 }
1167
1168 error = gfs2_ea_find(ip, type, name, &el);
1192 if (error) 1169 if (error)
1193 return error; 1170 return error;
1194 1171
1195 if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) 1172 if (el.el_ea) {
1196 error = -EPERM; 1173 if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
1197 else 1174 brelse(el.el_bh);
1198 error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); 1175 return -EPERM;
1176 }
1199 1177
1200 gfs2_glock_dq_uninit(&i_gh); 1178 error = -EEXIST;
1179 if (!(flags & XATTR_CREATE)) {
1180 int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
1181 error = ea_set_i(ip, type, name, value, size, &el);
1182 if (!error && unstuffed)
1183 ea_set_remove_unstuffed(ip, &el);
1184 }
1185
1186 brelse(el.el_bh);
1187 return error;
1188 }
1189
1190 error = -ENODATA;
1191 if (!(flags & XATTR_REPLACE))
1192 error = ea_set_i(ip, type, name, value, size, NULL);
1201 1193
1202 return error; 1194 return error;
1203} 1195}
@@ -1503,3 +1495,64 @@ out_alloc:
1503 return error; 1495 return error;
1504} 1496}
1505 1497
1498static int gfs2_xattr_user_get(struct inode *inode, const char *name,
1499 void *buffer, size_t size)
1500{
1501 return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
1502}
1503
1504static int gfs2_xattr_user_set(struct inode *inode, const char *name,
1505 const void *value, size_t size, int flags)
1506{
1507 return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
1508}
1509
1510static int gfs2_xattr_system_get(struct inode *inode, const char *name,
1511 void *buffer, size_t size)
1512{
1513 return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size);
1514}
1515
1516static int gfs2_xattr_system_set(struct inode *inode, const char *name,
1517 const void *value, size_t size, int flags)
1518{
1519 return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags);
1520}
1521
1522static int gfs2_xattr_security_get(struct inode *inode, const char *name,
1523 void *buffer, size_t size)
1524{
1525 return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
1526}
1527
1528static int gfs2_xattr_security_set(struct inode *inode, const char *name,
1529 const void *value, size_t size, int flags)
1530{
1531 return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
1532}
1533
1534static struct xattr_handler gfs2_xattr_user_handler = {
1535 .prefix = XATTR_USER_PREFIX,
1536 .get = gfs2_xattr_user_get,
1537 .set = gfs2_xattr_user_set,
1538};
1539
1540static struct xattr_handler gfs2_xattr_security_handler = {
1541 .prefix = XATTR_SECURITY_PREFIX,
1542 .get = gfs2_xattr_security_get,
1543 .set = gfs2_xattr_security_set,
1544};
1545
1546static struct xattr_handler gfs2_xattr_system_handler = {
1547 .prefix = XATTR_SYSTEM_PREFIX,
1548 .get = gfs2_xattr_system_get,
1549 .set = gfs2_xattr_system_set,
1550};
1551
1552struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler,
1556 NULL,
1557};
1558
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h
index c82dbe01d713..cbdfd7743733 100644
--- a/fs/gfs2/eattr.h
+++ b/fs/gfs2/xattr.h
@@ -19,7 +19,7 @@ struct iattr;
19#define GFS2_EA_SIZE(ea) \ 19#define GFS2_EA_SIZE(ea) \
20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ 20ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ 21 ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) 22 (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
23 23
24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) 24#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) 25#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
27#define GFS2_EAREQ_SIZE_STUFFED(er) \ 27#define GFS2_EAREQ_SIZE_STUFFED(er) \
28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) 28ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
29 29
30#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \
31ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
32 sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8)
33
34#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) 30#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
35#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) 31#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
36 32
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \
43#define GFS2_EA_BH2FIRST(bh) \ 39#define GFS2_EA_BH2FIRST(bh) \
44((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) 40((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
45 41
46#define GFS2_ERF_MODE 0x80000000
47
48struct gfs2_ea_request { 42struct gfs2_ea_request {
49 const char *er_name; 43 const char *er_name;
50 char *er_data; 44 char *er_data;
51 unsigned int er_name_len; 45 unsigned int er_name_len;
52 unsigned int er_data_len; 46 unsigned int er_data_len;
53 unsigned int er_type; /* GFS2_EATYPE_... */ 47 unsigned int er_type; /* GFS2_EATYPE_... */
54 int er_flags;
55 mode_t er_mode;
56}; 48};
57 49
58struct gfs2_ea_location { 50struct gfs2_ea_location {
@@ -61,40 +53,20 @@ struct gfs2_ea_location {
61 struct gfs2_ea_header *el_prev; 53 struct gfs2_ea_header *el_prev;
62}; 54};
63 55
64int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 56extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
65int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 57 void *buffer, size_t size);
66int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); 58extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
67 59 const void *value, size_t size, int flags);
68int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); 60extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
69int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); 61extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
70int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er);
71int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er);
72
73int gfs2_ea_dealloc(struct gfs2_inode *ip);
74 62
75/* Exported to acl.c */ 63/* Exported to acl.c */
76 64
77int gfs2_ea_find(struct gfs2_inode *ip, 65extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
78 struct gfs2_ea_request *er, 66 struct gfs2_ea_location *el);
79 struct gfs2_ea_location *el); 67extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
80int gfs2_ea_get_copy(struct gfs2_inode *ip, 68 char *data, size_t size);
81 struct gfs2_ea_location *el, 69extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
82 char *data); 70 struct iattr *attr, char *data);
83int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el,
84 struct iattr *attr, char *data);
85
86static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea)
87{
88 switch (ea->ea_type) {
89 case GFS2_EATYPE_USR:
90 return 5 + ea->ea_name_len + 1;
91 case GFS2_EATYPE_SYS:
92 return 7 + ea->ea_name_len + 1;
93 case GFS2_EATYPE_SECURITY:
94 return 9 + ea->ea_name_len + 1;
95 default:
96 return 0;
97 }
98}
99 71
100#endif /* __EATTR_DOT_H__ */ 72#endif /* __EATTR_DOT_H__ */
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index cb88dac8ccaa..a93b885311d8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations;
44static const struct inode_operations hugetlbfs_inode_operations; 44static const struct inode_operations hugetlbfs_inode_operations;
45 45
46static struct backing_dev_info hugetlbfs_backing_dev_info = { 46static struct backing_dev_info hugetlbfs_backing_dev_info = {
47 .name = "hugetlbfs",
47 .ra_pages = 0, /* No readahead */ 48 .ra_pages = 0, /* No readahead */
48 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 49 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
49}; 50};
diff --git a/fs/inode.c b/fs/inode.c
index ae7b67e48661..b2ba83d2c4e1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -182,9 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
182 if (sb->s_bdev) { 182 if (sb->s_bdev) {
183 struct backing_dev_info *bdi; 183 struct backing_dev_info *bdi;
184 184
185 bdi = sb->s_bdev->bd_inode_backing_dev_info; 185 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
186 if (!bdi)
187 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
188 mapping->backing_dev_info = bdi; 186 mapping->backing_dev_info = bdi;
189 } 187 }
190 inode->i_private = NULL; 188 inode->i_private = NULL;
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 61f32f3868cd..b0435dd0654d 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal)
456{ 456{
457 transaction_t * transaction; 457 transaction_t * transaction;
458 tid_t first_tid; 458 tid_t first_tid;
459 unsigned long blocknr, freed; 459 unsigned int blocknr, freed;
460 460
461 if (is_journal_aborted(journal)) 461 if (is_journal_aborted(journal))
462 return 1; 462 return 1;
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal)
502 freed = freed + journal->j_last - journal->j_first; 502 freed = freed + journal->j_last - journal->j_first;
503 503
504 jbd_debug(1, 504 jbd_debug(1,
505 "Cleaning journal tail from %d to %d (offset %lu), " 505 "Cleaning journal tail from %d to %d (offset %u), "
506 "freeing %lu\n", 506 "freeing %u\n",
507 journal->j_tail_sequence, first_tid, blocknr, freed); 507 journal->j_tail_sequence, first_tid, blocknr, freed);
508 508
509 journal->j_free += freed; 509 journal->j_free += freed;
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 618e21c0b7a3..4bd882548c45 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal)
308 int bufs; 308 int bufs;
309 int flags; 309 int flags;
310 int err; 310 int err;
311 unsigned long blocknr; 311 unsigned int blocknr;
312 ktime_t start_time; 312 ktime_t start_time;
313 u64 commit_time; 313 u64 commit_time;
314 char *tagp = NULL; 314 char *tagp = NULL;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f96f85092d1c..bd3c073b485d 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal)
276int journal_write_metadata_buffer(transaction_t *transaction, 276int journal_write_metadata_buffer(transaction_t *transaction,
277 struct journal_head *jh_in, 277 struct journal_head *jh_in,
278 struct journal_head **jh_out, 278 struct journal_head **jh_out,
279 unsigned long blocknr) 279 unsigned int blocknr)
280{ 280{
281 int need_copy_out = 0; 281 int need_copy_out = 0;
282 int done_copy_out = 0; 282 int done_copy_out = 0;
@@ -567,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid)
567 * Log buffer allocation routines: 567 * Log buffer allocation routines:
568 */ 568 */
569 569
570int journal_next_log_block(journal_t *journal, unsigned long *retp) 570int journal_next_log_block(journal_t *journal, unsigned int *retp)
571{ 571{
572 unsigned long blocknr; 572 unsigned int blocknr;
573 573
574 spin_lock(&journal->j_state_lock); 574 spin_lock(&journal->j_state_lock);
575 J_ASSERT(journal->j_free > 1); 575 J_ASSERT(journal->j_free > 1);
@@ -590,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp)
590 * this is a no-op. If needed, we can use j_blk_offset - everything is 590 * this is a no-op. If needed, we can use j_blk_offset - everything is
591 * ready. 591 * ready.
592 */ 592 */
593int journal_bmap(journal_t *journal, unsigned long blocknr, 593int journal_bmap(journal_t *journal, unsigned int blocknr,
594 unsigned long *retp) 594 unsigned int *retp)
595{ 595{
596 int err = 0; 596 int err = 0;
597 unsigned long ret; 597 unsigned int ret;
598 598
599 if (journal->j_inode) { 599 if (journal->j_inode) {
600 ret = bmap(journal->j_inode, blocknr); 600 ret = bmap(journal->j_inode, blocknr);
@@ -604,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
604 char b[BDEVNAME_SIZE]; 604 char b[BDEVNAME_SIZE];
605 605
606 printk(KERN_ALERT "%s: journal block not found " 606 printk(KERN_ALERT "%s: journal block not found "
607 "at offset %lu on %s\n", 607 "at offset %u on %s\n",
608 __func__, 608 __func__,
609 blocknr, 609 blocknr,
610 bdevname(journal->j_dev, b)); 610 bdevname(journal->j_dev, b));
@@ -630,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr,
630struct journal_head *journal_get_descriptor_buffer(journal_t *journal) 630struct journal_head *journal_get_descriptor_buffer(journal_t *journal)
631{ 631{
632 struct buffer_head *bh; 632 struct buffer_head *bh;
633 unsigned long blocknr; 633 unsigned int blocknr;
634 int err; 634 int err;
635 635
636 err = journal_next_log_block(journal, &blocknr); 636 err = journal_next_log_block(journal, &blocknr);
@@ -774,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode)
774 journal_t *journal = journal_init_common(); 774 journal_t *journal = journal_init_common();
775 int err; 775 int err;
776 int n; 776 int n;
777 unsigned long blocknr; 777 unsigned int blocknr;
778 778
779 if (!journal) 779 if (!journal)
780 return NULL; 780 return NULL;
@@ -846,12 +846,12 @@ static void journal_fail_superblock (journal_t *journal)
846static int journal_reset(journal_t *journal) 846static int journal_reset(journal_t *journal)
847{ 847{
848 journal_superblock_t *sb = journal->j_superblock; 848 journal_superblock_t *sb = journal->j_superblock;
849 unsigned long first, last; 849 unsigned int first, last;
850 850
851 first = be32_to_cpu(sb->s_first); 851 first = be32_to_cpu(sb->s_first);
852 last = be32_to_cpu(sb->s_maxlen); 852 last = be32_to_cpu(sb->s_maxlen);
853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { 853 if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) {
854 printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n", 854 printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n",
855 first, last); 855 first, last);
856 journal_fail_superblock(journal); 856 journal_fail_superblock(journal);
857 return -EINVAL; 857 return -EINVAL;
@@ -885,7 +885,7 @@ static int journal_reset(journal_t *journal)
885 **/ 885 **/
886int journal_create(journal_t *journal) 886int journal_create(journal_t *journal)
887{ 887{
888 unsigned long blocknr; 888 unsigned int blocknr;
889 struct buffer_head *bh; 889 struct buffer_head *bh;
890 journal_superblock_t *sb; 890 journal_superblock_t *sb;
891 int i, err; 891 int i, err;
@@ -969,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait)
969 if (sb->s_start == 0 && journal->j_tail_sequence == 969 if (sb->s_start == 0 && journal->j_tail_sequence ==
970 journal->j_transaction_sequence) { 970 journal->j_transaction_sequence) {
971 jbd_debug(1,"JBD: Skipping superblock update on recovered sb " 971 jbd_debug(1,"JBD: Skipping superblock update on recovered sb "
972 "(start %ld, seq %d, errno %d)\n", 972 "(start %u, seq %d, errno %d)\n",
973 journal->j_tail, journal->j_tail_sequence, 973 journal->j_tail, journal->j_tail_sequence,
974 journal->j_errno); 974 journal->j_errno);
975 goto out; 975 goto out;
976 } 976 }
977 977
978 spin_lock(&journal->j_state_lock); 978 spin_lock(&journal->j_state_lock);
979 jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", 979 jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n",
980 journal->j_tail, journal->j_tail_sequence, journal->j_errno); 980 journal->j_tail, journal->j_tail_sequence, journal->j_errno);
981 981
982 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); 982 sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
@@ -1371,7 +1371,7 @@ int journal_flush(journal_t *journal)
1371{ 1371{
1372 int err = 0; 1372 int err = 0;
1373 transaction_t *transaction = NULL; 1373 transaction_t *transaction = NULL;
1374 unsigned long old_tail; 1374 unsigned int old_tail;
1375 1375
1376 spin_lock(&journal->j_state_lock); 1376 spin_lock(&journal->j_state_lock);
1377 1377
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index db5e982c5ddf..cb1a49ae605e 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
70{ 70{
71 int err; 71 int err;
72 unsigned int max, nbufs, next; 72 unsigned int max, nbufs, next;
73 unsigned long blocknr; 73 unsigned int blocknr;
74 struct buffer_head *bh; 74 struct buffer_head *bh;
75 75
76 struct buffer_head * bufs[MAXBUF]; 76 struct buffer_head * bufs[MAXBUF];
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
132 unsigned int offset) 132 unsigned int offset)
133{ 133{
134 int err; 134 int err;
135 unsigned long blocknr; 135 unsigned int blocknr;
136 struct buffer_head *bh; 136 struct buffer_head *bh;
137 137
138 *bhp = NULL; 138 *bhp = NULL;
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal,
314 struct recovery_info *info, enum passtype pass) 314 struct recovery_info *info, enum passtype pass)
315{ 315{
316 unsigned int first_commit_ID, next_commit_ID; 316 unsigned int first_commit_ID, next_commit_ID;
317 unsigned long next_log_block; 317 unsigned int next_log_block;
318 int err, success = 0; 318 int err, success = 0;
319 journal_superblock_t * sb; 319 journal_superblock_t * sb;
320 journal_header_t * tmp; 320 journal_header_t * tmp;
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal,
367 if (tid_geq(next_commit_ID, info->end_transaction)) 367 if (tid_geq(next_commit_ID, info->end_transaction))
368 break; 368 break;
369 369
370 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", 370 jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n",
371 next_commit_ID, next_log_block, journal->j_last); 371 next_commit_ID, next_log_block, journal->j_last);
372 372
373 /* Skip over each chunk of the transaction looking 373 /* Skip over each chunk of the transaction looking
374 * either the next descriptor block or the final commit 374 * either the next descriptor block or the final commit
375 * record. */ 375 * record. */
376 376
377 jbd_debug(3, "JBD: checking block %ld\n", next_log_block); 377 jbd_debug(3, "JBD: checking block %u\n", next_log_block);
378 err = jread(&bh, journal, next_log_block); 378 err = jread(&bh, journal, next_log_block);
379 if (err) 379 if (err)
380 goto failed; 380 goto failed;
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal,
429 tagp = &bh->b_data[sizeof(journal_header_t)]; 429 tagp = &bh->b_data[sizeof(journal_header_t)];
430 while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) 430 while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
431 <= journal->j_blocksize) { 431 <= journal->j_blocksize) {
432 unsigned long io_block; 432 unsigned int io_block;
433 433
434 tag = (journal_block_tag_t *) tagp; 434 tag = (journal_block_tag_t *) tagp;
435 flags = be32_to_cpu(tag->t_flags); 435 flags = be32_to_cpu(tag->t_flags);
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal,
443 success = err; 443 success = err;
444 printk (KERN_ERR 444 printk (KERN_ERR
445 "JBD: IO error %d recovering " 445 "JBD: IO error %d recovering "
446 "block %ld in log\n", 446 "block %u in log\n",
447 err, io_block); 447 err, io_block);
448 } else { 448 } else {
449 unsigned long blocknr; 449 unsigned int blocknr;
450 450
451 J_ASSERT(obh != NULL); 451 J_ASSERT(obh != NULL);
452 blocknr = be32_to_cpu(tag->t_blocknr); 452 blocknr = be32_to_cpu(tag->t_blocknr);
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
581 max = be32_to_cpu(header->r_count); 581 max = be32_to_cpu(header->r_count);
582 582
583 while (offset < max) { 583 while (offset < max) {
584 unsigned long blocknr; 584 unsigned int blocknr;
585 int err; 585 int err;
586 586
587 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); 587 blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index da6cd9bdaabc..ad717328343a 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s
101{ 101{
102 struct list_head hash; 102 struct list_head hash;
103 tid_t sequence; /* Used for recovery only */ 103 tid_t sequence; /* Used for recovery only */
104 unsigned long blocknr; 104 unsigned int blocknr;
105}; 105};
106 106
107 107
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int);
126/* Utility functions to maintain the revoke table */ 126/* Utility functions to maintain the revoke table */
127 127
128/* Borrowed from buffer.c: this is a tried and tested block hash function */ 128/* Borrowed from buffer.c: this is a tried and tested block hash function */
129static inline int hash(journal_t *journal, unsigned long block) 129static inline int hash(journal_t *journal, unsigned int block)
130{ 130{
131 struct jbd_revoke_table_s *table = journal->j_revoke; 131 struct jbd_revoke_table_s *table = journal->j_revoke;
132 int hash_shift = table->hash_shift; 132 int hash_shift = table->hash_shift;
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block)
136 (block << (hash_shift - 12))) & (table->hash_size - 1); 136 (block << (hash_shift - 12))) & (table->hash_size - 1);
137} 137}
138 138
139static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, 139static int insert_revoke_hash(journal_t *journal, unsigned int blocknr,
140 tid_t seq) 140 tid_t seq)
141{ 141{
142 struct list_head *hash_list; 142 struct list_head *hash_list;
@@ -166,7 +166,7 @@ oom:
166/* Find a revoke record in the journal's hash table. */ 166/* Find a revoke record in the journal's hash table. */
167 167
168static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, 168static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
169 unsigned long blocknr) 169 unsigned int blocknr)
170{ 170{
171 struct list_head *hash_list; 171 struct list_head *hash_list;
172 struct jbd_revoke_record_s *record; 172 struct jbd_revoke_record_s *record;
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal)
332 * by one. 332 * by one.
333 */ 333 */
334 334
335int journal_revoke(handle_t *handle, unsigned long blocknr, 335int journal_revoke(handle_t *handle, unsigned int blocknr,
336 struct buffer_head *bh_in) 336 struct buffer_head *bh_in)
337{ 337{
338 struct buffer_head *bh = NULL; 338 struct buffer_head *bh = NULL;
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
401 } 401 }
402 } 402 }
403 403
404 jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); 404 jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in);
405 err = insert_revoke_hash(journal, blocknr, 405 err = insert_revoke_hash(journal, blocknr,
406 handle->h_transaction->t_tid); 406 handle->h_transaction->t_tid);
407 BUFFER_TRACE(bh_in, "exit"); 407 BUFFER_TRACE(bh_in, "exit");
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal,
644 */ 644 */
645 645
646int journal_set_revoke(journal_t *journal, 646int journal_set_revoke(journal_t *journal,
647 unsigned long blocknr, 647 unsigned int blocknr,
648 tid_t sequence) 648 tid_t sequence)
649{ 649{
650 struct jbd_revoke_record_s *record; 650 struct jbd_revoke_record_s *record;
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal,
668 */ 668 */
669 669
670int journal_test_revoke(journal_t *journal, 670int journal_test_revoke(journal_t *journal,
671 unsigned long blocknr, 671 unsigned int blocknr,
672 tid_t sequence) 672 tid_t sequence)
673{ 673{
674 struct jbd_revoke_record_s *record; 674 struct jbd_revoke_record_s *record;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index c03ac11f74be..006f9ad838a2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction)
56 spin_lock_init(&transaction->t_handle_lock); 56 spin_lock_init(&transaction->t_handle_lock);
57 57
58 /* Set up the commit timer for the new transaction. */ 58 /* Set up the commit timer for the new transaction. */
59 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 59 journal->j_commit_timer.expires =
60 round_jiffies_up(transaction->t_expires);
60 add_timer(&journal->j_commit_timer); 61 add_timer(&journal->j_commit_timer);
61 62
62 J_ASSERT(journal->j_running_transaction == NULL); 63 J_ASSERT(journal->j_running_transaction == NULL);
@@ -228,6 +229,8 @@ repeat_locked:
228 __log_space_left(journal)); 229 __log_space_left(journal));
229 spin_unlock(&transaction->t_handle_lock); 230 spin_unlock(&transaction->t_handle_lock);
230 spin_unlock(&journal->j_state_lock); 231 spin_unlock(&journal->j_state_lock);
232
233 lock_map_acquire(&handle->h_lockdep_map);
231out: 234out:
232 if (unlikely(new_transaction)) /* It's usually NULL */ 235 if (unlikely(new_transaction)) /* It's usually NULL */
233 kfree(new_transaction); 236 kfree(new_transaction);
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks)
292 handle = ERR_PTR(err); 295 handle = ERR_PTR(err);
293 goto out; 296 goto out;
294 } 297 }
295
296 lock_map_acquire(&handle->h_lockdep_map);
297
298out: 298out:
299 return handle; 299 return handle;
300} 300}
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks)
416 __log_start_commit(journal, transaction->t_tid); 416 __log_start_commit(journal, transaction->t_tid);
417 spin_unlock(&journal->j_state_lock); 417 spin_unlock(&journal->j_state_lock);
418 418
419 lock_map_release(&handle->h_lockdep_map);
419 handle->h_buffer_credits = nblocks; 420 handle->h_buffer_credits = nblocks;
420 ret = start_this_handle(journal, handle); 421 ret = start_this_handle(journal, handle);
421 return ret; 422 return ret;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 7b4088b2364d..26d991ddc1e6 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -25,6 +25,7 @@
25#include <linux/writeback.h> 25#include <linux/writeback.h>
26#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
27#include <linux/bio.h> 27#include <linux/bio.h>
28#include <linux/blkdev.h>
28#include <trace/events/jbd2.h> 29#include <trace/events/jbd2.h>
29 30
30/* 31/*
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal,
133 bh->b_end_io = journal_end_buffer_io_sync; 134 bh->b_end_io = journal_end_buffer_io_sync;
134 135
135 if (journal->j_flags & JBD2_BARRIER && 136 if (journal->j_flags & JBD2_BARRIER &&
136 !JBD2_HAS_INCOMPAT_FEATURE(journal, 137 !JBD2_HAS_INCOMPAT_FEATURE(journal,
137 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 138 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
138 set_buffer_ordered(bh); 139 set_buffer_ordered(bh);
139 barrier_done = 1; 140 barrier_done = 1;
140 } 141 }
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping)
220 .nr_to_write = mapping->nrpages * 2, 221 .nr_to_write = mapping->nrpages * 2,
221 .range_start = 0, 222 .range_start = 0,
222 .range_end = i_size_read(mapping->host), 223 .range_end = i_size_read(mapping->host),
223 .for_writepages = 1,
224 }; 224 };
225 225
226 ret = generic_writepages(mapping, &wbc); 226 ret = generic_writepages(mapping, &wbc);
@@ -707,11 +707,13 @@ start_journal_io:
707 /* Done it all: now write the commit record asynchronously. */ 707 /* Done it all: now write the commit record asynchronously. */
708 708
709 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 709 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 710 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
711 err = journal_submit_commit_record(journal, commit_transaction, 711 err = journal_submit_commit_record(journal, commit_transaction,
712 &cbh, crc32_sum); 712 &cbh, crc32_sum);
713 if (err) 713 if (err)
714 __jbd2_journal_abort_hard(journal); 714 __jbd2_journal_abort_hard(journal);
715 if (journal->j_flags & JBD2_BARRIER)
716 blkdev_issue_flush(journal->j_dev, NULL);
715 } 717 }
716 718
717 /* 719 /*
@@ -834,7 +836,7 @@ wait_for_iobuf:
834 jbd_debug(3, "JBD: commit phase 5\n"); 836 jbd_debug(3, "JBD: commit phase 5\n");
835 837
836 if (!JBD2_HAS_INCOMPAT_FEATURE(journal, 838 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
837 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { 839 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
838 err = journal_submit_commit_record(journal, commit_transaction, 840 err = journal_submit_commit_record(journal, commit_transaction,
839 &cbh, crc32_sum); 841 &cbh, crc32_sum);
840 if (err) 842 if (err)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e378cb383979..a8a358bc0f21 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal)
1187 1187
1188 first = be32_to_cpu(sb->s_first); 1188 first = be32_to_cpu(sb->s_first);
1189 last = be32_to_cpu(sb->s_maxlen); 1189 last = be32_to_cpu(sb->s_maxlen);
1190 if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) {
1191 printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n",
1192 first, last);
1193 journal_fail_superblock(journal);
1194 return -EINVAL;
1195 }
1190 1196
1191 journal->j_first = first; 1197 journal->j_first = first;
1192 journal->j_last = last; 1198 journal->j_last = last;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6213ac728f30..a0512700542f 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
57 INIT_LIST_HEAD(&transaction->t_private_list); 57 INIT_LIST_HEAD(&transaction->t_private_list);
58 58
59 /* Set up the commit timer for the new transaction. */ 59 /* Set up the commit timer for the new transaction. */
60 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 60 journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
61 add_timer(&journal->j_commit_timer); 61 add_timer(&journal->j_commit_timer);
62 62
63 J_ASSERT(journal->j_running_transaction == NULL); 63 J_ASSERT(journal->j_running_transaction == NULL);
@@ -238,6 +238,8 @@ repeat_locked:
238 __jbd2_log_space_left(journal)); 238 __jbd2_log_space_left(journal));
239 spin_unlock(&transaction->t_handle_lock); 239 spin_unlock(&transaction->t_handle_lock);
240 spin_unlock(&journal->j_state_lock); 240 spin_unlock(&journal->j_state_lock);
241
242 lock_map_acquire(&handle->h_lockdep_map);
241out: 243out:
242 if (unlikely(new_transaction)) /* It's usually NULL */ 244 if (unlikely(new_transaction)) /* It's usually NULL */
243 kfree(new_transaction); 245 kfree(new_transaction);
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
303 handle = ERR_PTR(err); 305 handle = ERR_PTR(err);
304 goto out; 306 goto out;
305 } 307 }
306
307 lock_map_acquire(&handle->h_lockdep_map);
308out: 308out:
309 return handle; 309 return handle;
310} 310}
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
426 __jbd2_log_start_commit(journal, transaction->t_tid); 426 __jbd2_log_start_commit(journal, transaction->t_tid);
427 spin_unlock(&journal->j_state_lock); 427 spin_unlock(&journal->j_state_lock);
428 428
429 lock_map_release(&handle->h_lockdep_map);
429 handle->h_buffer_credits = nblocks; 430 handle->h_buffer_credits = nblocks;
430 ret = start_this_handle(journal, handle); 431 ret = start_this_handle(journal, handle);
431 return ret; 432 return ret;
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 8fcb6239218e..7edb62e97419 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
258 return rc; 258 return rc;
259} 259}
260 260
261static int jffs2_check_acl(struct inode *inode, int mask) 261int jffs2_check_acl(struct inode *inode, int mask)
262{ 262{
263 struct posix_acl *acl; 263 struct posix_acl *acl;
264 int rc; 264 int rc;
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask)
274 return -EAGAIN; 274 return -EAGAIN;
275} 275}
276 276
277int jffs2_permission(struct inode *inode, int mask)
278{
279 return generic_permission(inode, mask, jffs2_check_acl);
280}
281
282int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) 277int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode)
283{ 278{
284 struct posix_acl *acl, *clone; 279 struct posix_acl *acl, *clone;
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index fc929f2a14f6..f0ba63e3c36b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -26,7 +26,7 @@ struct jffs2_acl_header {
26 26
27#ifdef CONFIG_JFFS2_FS_POSIX_ACL 27#ifdef CONFIG_JFFS2_FS_POSIX_ACL
28 28
29extern int jffs2_permission(struct inode *, int); 29extern int jffs2_check_acl(struct inode *, int);
30extern int jffs2_acl_chmod(struct inode *); 30extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
39#define jffs2_permission (NULL) 39#define jffs2_check_acl (NULL)
40#define jffs2_acl_chmod(inode) (0) 40#define jffs2_acl_chmod(inode) (0)
41#define jffs2_init_acl_pre(dir_i,inode,mode) (0) 41#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
42#define jffs2_init_acl_post(inode) (0) 42#define jffs2_init_acl_post(inode) (0)
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 6f60cc910f4c..7aa4417e085f 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations =
55 .rmdir = jffs2_rmdir, 55 .rmdir = jffs2_rmdir,
56 .mknod = jffs2_mknod, 56 .mknod = jffs2_mknod,
57 .rename = jffs2_rename, 57 .rename = jffs2_rename,
58 .permission = jffs2_permission, 58 .check_acl = jffs2_check_acl,
59 .setattr = jffs2_setattr, 59 .setattr = jffs2_setattr,
60 .setxattr = jffs2_setxattr, 60 .setxattr = jffs2_setxattr,
61 .getxattr = jffs2_getxattr, 61 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 23c947539864..b7b74e299142 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations =
56 56
57const struct inode_operations jffs2_file_inode_operations = 57const struct inode_operations jffs2_file_inode_operations =
58{ 58{
59 .permission = jffs2_permission, 59 .check_acl = jffs2_check_acl,
60 .setattr = jffs2_setattr, 60 .setattr = jffs2_setattr,
61 .setxattr = jffs2_setxattr, 61 .setxattr = jffs2_setxattr,
62 .getxattr = jffs2_getxattr, 62 .getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index b7339c3b6ad9..4ec11e8bda8c 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations =
21{ 21{
22 .readlink = generic_readlink, 22 .readlink = generic_readlink,
23 .follow_link = jffs2_follow_link, 23 .follow_link = jffs2_follow_link,
24 .permission = jffs2_permission, 24 .check_acl = jffs2_check_acl,
25 .setattr = jffs2_setattr, 25 .setattr = jffs2_setattr,
26 .setxattr = jffs2_setxattr, 26 .setxattr = jffs2_setxattr,
27 .getxattr = jffs2_getxattr, 27 .getxattr = jffs2_getxattr,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index a29c7c3e3fb8..d66477c34306 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -114,7 +114,7 @@ out:
114 return rc; 114 return rc;
115} 115}
116 116
117static int jfs_check_acl(struct inode *inode, int mask) 117int jfs_check_acl(struct inode *inode, int mask)
118{ 118{
119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); 119 struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
120 120
@@ -129,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask)
129 return -EAGAIN; 129 return -EAGAIN;
130} 130}
131 131
132int jfs_permission(struct inode *inode, int mask)
133{
134 return generic_permission(inode, mask, jfs_check_acl);
135}
136
137int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) 132int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
138{ 133{
139 struct posix_acl *acl = NULL; 134 struct posix_acl *acl = NULL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 7f6063acaa3b..2b70fa78e4a7 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = {
96 .removexattr = jfs_removexattr, 96 .removexattr = jfs_removexattr,
97#ifdef CONFIG_JFS_POSIX_ACL 97#ifdef CONFIG_JFS_POSIX_ACL
98 .setattr = jfs_setattr, 98 .setattr = jfs_setattr,
99 .permission = jfs_permission, 99 .check_acl = jfs_check_acl,
100#endif 100#endif
101}; 101};
102 102
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 88475f10a389..b07bd417ef85 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
20 20
21#ifdef CONFIG_JFS_POSIX_ACL 21#ifdef CONFIG_JFS_POSIX_ACL
22 22
23int jfs_permission(struct inode *, int); 23int jfs_check_acl(struct inode *, int);
24int jfs_init_acl(tid_t, struct inode *, struct inode *); 24int jfs_init_acl(tid_t, struct inode *, struct inode *);
25int jfs_setattr(struct dentry *, struct iattr *); 25int jfs_setattr(struct dentry *, struct iattr *);
26 26
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 514ee2edb92a..c79a4270f083 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = {
1543 .removexattr = jfs_removexattr, 1543 .removexattr = jfs_removexattr,
1544#ifdef CONFIG_JFS_POSIX_ACL 1544#ifdef CONFIG_JFS_POSIX_ACL
1545 .setattr = jfs_setattr, 1545 .setattr = jfs_setattr,
1546 .permission = jfs_permission, 1546 .check_acl = jfs_check_acl,
1547#endif 1547#endif
1548}; 1548};
1549 1549
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 99d737bd4325..7cb076ac6b45 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap)
87 return hash & (NLM_HOST_NRHASH - 1); 87 return hash & (NLM_HOST_NRHASH - 1);
88} 88}
89 89
90static void nlm_clear_port(struct sockaddr *sap)
91{
92 switch (sap->sa_family) {
93 case AF_INET:
94 ((struct sockaddr_in *)sap)->sin_port = 0;
95 break;
96 case AF_INET6:
97 ((struct sockaddr_in6 *)sap)->sin6_port = 0;
98 break;
99 }
100}
101
102/* 90/*
103 * Common host lookup routine for server & client 91 * Common host lookup routine for server & client
104 */ 92 */
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
177 host->h_addrbuf = nsm->sm_addrbuf; 165 host->h_addrbuf = nsm->sm_addrbuf;
178 memcpy(nlm_addr(host), ni->sap, ni->salen); 166 memcpy(nlm_addr(host), ni->sap, ni->salen);
179 host->h_addrlen = ni->salen; 167 host->h_addrlen = ni->salen;
180 nlm_clear_port(nlm_addr(host)); 168 rpc_set_port(nlm_addr(host), 0);
181 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); 169 memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
182 host->h_version = ni->version; 170 host->h_version = ni->version;
183 host->h_proto = ni->protocol; 171 host->h_proto = ni->protocol;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 7fce1b525849..30c933188dd7 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
61 return (struct sockaddr *)&nsm->sm_addr; 61 return (struct sockaddr *)&nsm->sm_addr;
62} 62}
63 63
64static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf,
65 const size_t len)
66{
67 const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
68 snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr);
69}
70
71static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf,
72 const size_t len)
73{
74 const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
75
76 if (ipv6_addr_v4mapped(&sin6->sin6_addr))
77 snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]);
78 else if (sin6->sin6_scope_id != 0)
79 snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr,
80 sin6->sin6_scope_id);
81 else
82 snprintf(buf, len, "%pI6", &sin6->sin6_addr);
83}
84
85static void nsm_display_address(const struct sockaddr *sap,
86 char *buf, const size_t len)
87{
88 switch (sap->sa_family) {
89 case AF_INET:
90 nsm_display_ipv4_address(sap, buf, len);
91 break;
92 case AF_INET6:
93 nsm_display_ipv6_address(sap, buf, len);
94 break;
95 default:
96 snprintf(buf, len, "unsupported address family");
97 break;
98 }
99}
100
101static struct rpc_clnt *nsm_create(void) 64static struct rpc_clnt *nsm_create(void)
102{ 65{
103 struct sockaddr_in sin = { 66 struct sockaddr_in sin = {
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
307 memcpy(nsm_addr(new), sap, salen); 270 memcpy(nsm_addr(new), sap, salen);
308 new->sm_addrlen = salen; 271 new->sm_addrlen = salen;
309 nsm_init_private(new); 272 nsm_init_private(new);
310 nsm_display_address((const struct sockaddr *)&new->sm_addr, 273
311 new->sm_addrbuf, sizeof(new->sm_addrbuf)); 274 if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
275 sizeof(new->sm_addrbuf)) == 0)
276 (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
277 "unsupported address family");
312 memcpy(new->sm_name, hostname, hostname_len); 278 memcpy(new->sm_name, hostname, hostname_len);
313 new->sm_name[hostname_len] = '\0'; 279 new->sm_name[hostname_len] = '\0';
314 280
diff --git a/fs/locks.c b/fs/locks.c
index b6440f52178f..19ee18a6829b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
768 * give it the opportunity to lock the file. 768 * give it the opportunity to lock the file.
769 */ 769 */
770 if (found) 770 if (found)
771 cond_resched_bkl(); 771 cond_resched();
772 772
773find_conflict: 773find_conflict:
774 for_each_lock(inode, before) { 774 for_each_lock(inode, before) {
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
1591 if (can_sleep) 1591 if (can_sleep)
1592 lock->fl_flags |= FL_SLEEP; 1592 lock->fl_flags |= FL_SLEEP;
1593 1593
1594 error = security_file_lock(filp, cmd); 1594 error = security_file_lock(filp, lock->fl_type);
1595 if (error) 1595 if (error)
1596 goto out_free; 1596 goto out_free;
1597 1597
diff --git a/fs/namei.c b/fs/namei.c
index f3c5b278895a..d11f404667e9 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -169,19 +169,10 @@ void putname(const char *name)
169EXPORT_SYMBOL(putname); 169EXPORT_SYMBOL(putname);
170#endif 170#endif
171 171
172 172/*
173/** 173 * This does basic POSIX ACL permission checking
174 * generic_permission - check for access rights on a Posix-like filesystem
175 * @inode: inode to check access rights for
176 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
177 * @check_acl: optional callback to check for Posix ACLs
178 *
179 * Used to check for read/write/execute permissions on a file.
180 * We use "fsuid" for this, letting us set arbitrary permissions
181 * for filesystem access without changing the "normal" uids which
182 * are used for other things..
183 */ 174 */
184int generic_permission(struct inode *inode, int mask, 175static int acl_permission_check(struct inode *inode, int mask,
185 int (*check_acl)(struct inode *inode, int mask)) 176 int (*check_acl)(struct inode *inode, int mask))
186{ 177{
187 umode_t mode = inode->i_mode; 178 umode_t mode = inode->i_mode;
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask,
193 else { 184 else {
194 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { 185 if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) {
195 int error = check_acl(inode, mask); 186 int error = check_acl(inode, mask);
196 if (error == -EACCES) 187 if (error != -EAGAIN)
197 goto check_capabilities;
198 else if (error != -EAGAIN)
199 return error; 188 return error;
200 } 189 }
201 190
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask,
208 */ 197 */
209 if ((mask & ~mode) == 0) 198 if ((mask & ~mode) == 0)
210 return 0; 199 return 0;
200 return -EACCES;
201}
202
203/**
204 * generic_permission - check for access rights on a Posix-like filesystem
205 * @inode: inode to check access rights for
206 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
207 * @check_acl: optional callback to check for Posix ACLs
208 *
209 * Used to check for read/write/execute permissions on a file.
210 * We use "fsuid" for this, letting us set arbitrary permissions
211 * for filesystem access without changing the "normal" uids which
212 * are used for other things..
213 */
214int generic_permission(struct inode *inode, int mask,
215 int (*check_acl)(struct inode *inode, int mask))
216{
217 int ret;
218
219 /*
220 * Do the basic POSIX ACL permission checks.
221 */
222 ret = acl_permission_check(inode, mask, check_acl);
223 if (ret != -EACCES)
224 return ret;
211 225
212 check_capabilities:
213 /* 226 /*
214 * Read/write DACs are always overridable. 227 * Read/write DACs are always overridable.
215 * Executable DACs are overridable if at least one exec bit is set. 228 * Executable DACs are overridable if at least one exec bit is set.
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask)
262 if (inode->i_op->permission) 275 if (inode->i_op->permission)
263 retval = inode->i_op->permission(inode, mask); 276 retval = inode->i_op->permission(inode, mask);
264 else 277 else
265 retval = generic_permission(inode, mask, NULL); 278 retval = generic_permission(inode, mask, inode->i_op->check_acl);
266 279
267 if (retval) 280 if (retval)
268 return retval; 281 return retval;
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
432 */ 445 */
433static int exec_permission_lite(struct inode *inode) 446static int exec_permission_lite(struct inode *inode)
434{ 447{
435 umode_t mode = inode->i_mode; 448 int ret;
436 449
437 if (inode->i_op->permission) 450 if (inode->i_op->permission) {
438 return -EAGAIN; 451 ret = inode->i_op->permission(inode, MAY_EXEC);
439 452 if (!ret)
440 if (current_fsuid() == inode->i_uid) 453 goto ok;
441 mode >>= 6; 454 return ret;
442 else if (in_group_p(inode->i_gid)) 455 }
443 mode >>= 3; 456 ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl);
444 457 if (!ret)
445 if (mode & MAY_EXEC)
446 goto ok;
447
448 if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE))
449 goto ok;
450
451 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE))
452 goto ok; 458 goto ok;
453 459
454 if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) 460 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH))
455 goto ok; 461 goto ok;
456 462
457 return -EACCES; 463 return ret;
458ok: 464ok:
459 return security_inode_permission(inode, MAY_EXEC); 465 return security_inode_permission(inode, MAY_EXEC);
460} 466}
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
853 859
854 nd->flags |= LOOKUP_CONTINUE; 860 nd->flags |= LOOKUP_CONTINUE;
855 err = exec_permission_lite(inode); 861 err = exec_permission_lite(inode);
856 if (err == -EAGAIN)
857 err = inode_permission(nd->path.dentry->d_inode,
858 MAY_EXEC);
859 if (!err)
860 err = ima_path_check(&nd->path, MAY_EXEC,
861 IMA_COUNT_UPDATE);
862 if (err) 862 if (err)
863 break; 863 break;
864 864
@@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag)
1533 if (error) 1533 if (error)
1534 return error; 1534 return error;
1535 1535
1536 error = ima_path_check(path, 1536 error = ima_path_check(path, acc_mode ?
1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), 1537 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1538 ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
1538 IMA_COUNT_UPDATE); 1539 IMA_COUNT_UPDATE);
1540
1539 if (error) 1541 if (error)
1540 return error; 1542 return error;
1541 /* 1543 /*
1542 * An append-only file must be opened in append mode for writing. 1544 * An append-only file must be opened in append mode for writing.
1543 */ 1545 */
1544 if (IS_APPEND(inode)) { 1546 if (IS_APPEND(inode)) {
1547 error = -EPERM;
1545 if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) 1548 if ((flag & FMODE_WRITE) && !(flag & O_APPEND))
1546 return -EPERM; 1549 goto err_out;
1547 if (flag & O_TRUNC) 1550 if (flag & O_TRUNC)
1548 return -EPERM; 1551 goto err_out;
1549 } 1552 }
1550 1553
1551 /* O_NOATIME can only be set by the owner or superuser */ 1554 /* O_NOATIME can only be set by the owner or superuser */
1552 if (flag & O_NOATIME) 1555 if (flag & O_NOATIME)
1553 if (!is_owner_or_cap(inode)) 1556 if (!is_owner_or_cap(inode)) {
1554 return -EPERM; 1557 error = -EPERM;
1558 goto err_out;
1559 }
1555 1560
1556 /* 1561 /*
1557 * Ensure there are no outstanding leases on the file. 1562 * Ensure there are no outstanding leases on the file.
1558 */ 1563 */
1559 error = break_lease(inode, flag); 1564 error = break_lease(inode, flag);
1560 if (error) 1565 if (error)
1561 return error; 1566 goto err_out;
1562 1567
1563 if (flag & O_TRUNC) { 1568 if (flag & O_TRUNC) {
1564 error = get_write_access(inode); 1569 error = get_write_access(inode);
1565 if (error) 1570 if (error)
1566 return error; 1571 goto err_out;
1567 1572
1568 /* 1573 /*
1569 * Refuse to truncate files with mandatory locks held on them. 1574 * Refuse to truncate files with mandatory locks held on them.
@@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag)
1581 } 1586 }
1582 put_write_access(inode); 1587 put_write_access(inode);
1583 if (error) 1588 if (error)
1584 return error; 1589 goto err_out;
1585 } else 1590 } else
1586 if (flag & FMODE_WRITE) 1591 if (flag & FMODE_WRITE)
1587 vfs_dq_init(inode); 1592 vfs_dq_init(inode);
1588 1593
1589 return 0; 1594 return 0;
1595err_out:
1596 ima_counts_put(path, acc_mode ?
1597 acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
1598 ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
1599 return error;
1590} 1600}
1591 1601
1592/* 1602/*
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 845159814de2..da7fda639eac 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ 7nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \
8 direct.o pagelist.o proc.o read.o symlink.o unlink.o \ 8 direct.o pagelist.o proc.o read.o symlink.o unlink.o \
9 write.o namespace.o mount_clnt.o 9 write.o namespace.o mount_clnt.o \
10 dns_resolve.o cache_lib.o
10nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
11nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o 12nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o
12nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o 13nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
new file mode 100644
index 000000000000..b4ffd0146ea6
--- /dev/null
+++ b/fs/nfs/cache_lib.c
@@ -0,0 +1,140 @@
1/*
2 * linux/fs/nfs/cache_lib.c
3 *
4 * Helper routines for the NFS client caches
5 *
6 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
7 */
8#include <linux/kmod.h>
9#include <linux/module.h>
10#include <linux/moduleparam.h>
11#include <linux/mount.h>
12#include <linux/namei.h>
13#include <linux/sunrpc/cache.h>
14#include <linux/sunrpc/rpc_pipe_fs.h>
15
16#include "cache_lib.h"
17
18#define NFS_CACHE_UPCALL_PATHLEN 256
19#define NFS_CACHE_UPCALL_TIMEOUT 15
20
21static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] =
22 "/sbin/nfs_cache_getent";
23static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT;
24
25module_param_string(cache_getent, nfs_cache_getent_prog,
26 sizeof(nfs_cache_getent_prog), 0600);
27MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program");
28module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600);
29MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which "
30 "the cache upcall is assumed to have failed");
31
32int nfs_cache_upcall(struct cache_detail *cd, char *entry_name)
33{
34 static char *envp[] = { "HOME=/",
35 "TERM=linux",
36 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
37 NULL
38 };
39 char *argv[] = {
40 nfs_cache_getent_prog,
41 cd->name,
42 entry_name,
43 NULL
44 };
45 int ret = -EACCES;
46
47 if (nfs_cache_getent_prog[0] == '\0')
48 goto out;
49 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
50 /*
51 * Disable the upcall mechanism if we're getting an ENOENT or
52 * EACCES error. The admin can re-enable it on the fly by using
53 * sysfs to set the 'cache_getent' parameter once the problem
54 * has been fixed.
55 */
56 if (ret == -ENOENT || ret == -EACCES)
57 nfs_cache_getent_prog[0] = '\0';
58out:
59 return ret > 0 ? 0 : ret;
60}
61
62/*
63 * Deferred request handling
64 */
65void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq)
66{
67 if (atomic_dec_and_test(&dreq->count))
68 kfree(dreq);
69}
70
71static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
72{
73 struct nfs_cache_defer_req *dreq;
74
75 dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
76
77 complete_all(&dreq->completion);
78 nfs_cache_defer_req_put(dreq);
79}
80
81static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req)
82{
83 struct nfs_cache_defer_req *dreq;
84
85 dreq = container_of(req, struct nfs_cache_defer_req, req);
86 dreq->deferred_req.revisit = nfs_dns_cache_revisit;
87 atomic_inc(&dreq->count);
88
89 return &dreq->deferred_req;
90}
91
92struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void)
93{
94 struct nfs_cache_defer_req *dreq;
95
96 dreq = kzalloc(sizeof(*dreq), GFP_KERNEL);
97 if (dreq) {
98 init_completion(&dreq->completion);
99 atomic_set(&dreq->count, 1);
100 dreq->req.defer = nfs_dns_cache_defer;
101 }
102 return dreq;
103}
104
105int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq)
106{
107 if (wait_for_completion_timeout(&dreq->completion,
108 nfs_cache_getent_timeout * HZ) == 0)
109 return -ETIMEDOUT;
110 return 0;
111}
112
113int nfs_cache_register(struct cache_detail *cd)
114{
115 struct nameidata nd;
116 struct vfsmount *mnt;
117 int ret;
118
119 mnt = rpc_get_mount();
120 if (IS_ERR(mnt))
121 return PTR_ERR(mnt);
122 ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd);
123 if (ret)
124 goto err;
125 ret = sunrpc_cache_register_pipefs(nd.path.dentry,
126 cd->name, 0600, cd);
127 path_put(&nd.path);
128 if (!ret)
129 return ret;
130err:
131 rpc_put_mount();
132 return ret;
133}
134
135void nfs_cache_unregister(struct cache_detail *cd)
136{
137 sunrpc_cache_unregister_pipefs(cd);
138 rpc_put_mount();
139}
140
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h
new file mode 100644
index 000000000000..76f856e284e4
--- /dev/null
+++ b/fs/nfs/cache_lib.h
@@ -0,0 +1,27 @@
1/*
2 * Helper routines for the NFS client caches
3 *
4 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
5 */
6
7#include <linux/completion.h>
8#include <linux/sunrpc/cache.h>
9#include <asm/atomic.h>
10
11/*
12 * Deferred request handling
13 */
14struct nfs_cache_defer_req {
15 struct cache_req req;
16 struct cache_deferred_req deferred_req;
17 struct completion completion;
18 atomic_t count;
19};
20
21extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name);
22extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void);
23extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq);
24extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq);
25
26extern int nfs_cache_register(struct cache_detail *cd);
27extern void nfs_cache_unregister(struct cache_detail *cd);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 7f604c7941fb..293fa0528a6e 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program;
43unsigned int nfs_callback_set_tcpport; 43unsigned int nfs_callback_set_tcpport;
44unsigned short nfs_callback_tcpport; 44unsigned short nfs_callback_tcpport;
45unsigned short nfs_callback_tcpport6; 45unsigned short nfs_callback_tcpport6;
46static const int nfs_set_port_min = 0; 46#define NFS_CALLBACK_MAXPORTNR (65535U)
47static const int nfs_set_port_max = 65535;
48 47
49static int param_set_port(const char *val, struct kernel_param *kp) 48static int param_set_portnr(const char *val, struct kernel_param *kp)
50{ 49{
51 char *endp; 50 unsigned long num;
52 int num = simple_strtol(val, &endp, 0); 51 int ret;
53 if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) 52
53 if (!val)
54 return -EINVAL;
55 ret = strict_strtoul(val, 0, &num);
56 if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR)
54 return -EINVAL; 57 return -EINVAL;
55 *((int *)kp->arg) = num; 58 *((unsigned int *)kp->arg) = num;
56 return 0; 59 return 0;
57} 60}
58 61
59module_param_call(callback_tcpport, param_set_port, param_get_int, 62static int param_get_portnr(char *buffer, struct kernel_param *kp)
60 &nfs_callback_set_tcpport, 0644); 63{
64 return param_get_uint(buffer, kp);
65}
66#define param_check_portnr(name, p) __param_check(name, p, unsigned int);
67
68module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
61 69
62/* 70/*
63 * This is the NFSv4 callback kernel thread. 71 * This is the NFSv4 callback kernel thread.
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 8d25ccb2d51d..e350bd6a2334 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server,
809 /* Initialise the client representation from the mount data */ 809 /* Initialise the client representation from the mount data */
810 server->flags = data->flags; 810 server->flags = data->flags;
811 server->options = data->options; 811 server->options = data->options;
812 server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
813 NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP|
814 NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME;
812 815
813 if (data->rsize) 816 if (data->rsize)
814 server->rsize = nfs_block_size(data->rsize, NULL); 817 server->rsize = nfs_block_size(data->rsize, NULL);
@@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *
879 server->rsize = NFS_MAX_FILE_IO_SIZE; 882 server->rsize = NFS_MAX_FILE_IO_SIZE;
880 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 883 server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
881 884
885 server->backing_dev_info.name = "nfs";
882 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; 886 server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD;
883 887
884 if (server->wsize > max_rpc_payload) 888 if (server->wsize > max_rpc_payload)
@@ -1074,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data,
1074 (unsigned long long) server->fsid.major, 1078 (unsigned long long) server->fsid.major,
1075 (unsigned long long) server->fsid.minor); 1079 (unsigned long long) server->fsid.minor);
1076 1080
1077 BUG_ON(!server->nfs_client);
1078 BUG_ON(!server->nfs_client->rpc_ops);
1079 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1080
1081 spin_lock(&nfs_client_lock); 1081 spin_lock(&nfs_client_lock);
1082 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); 1082 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1083 list_add_tail(&server->master_link, &nfs_volume_list); 1083 list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1274,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server,
1274 1274
1275 /* Initialise the client representation from the mount data */ 1275 /* Initialise the client representation from the mount data */
1276 server->flags = data->flags; 1276 server->flags = data->flags;
1277 server->caps |= NFS_CAP_ATOMIC_OPEN; 1277 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
1278 server->options = data->options; 1278 server->options = data->options;
1279 1279
1280 /* Get a client record */ 1280 /* Get a client record */
@@ -1359,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data,
1359 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) 1359 if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN)
1360 server->namelen = NFS4_MAXNAMLEN; 1360 server->namelen = NFS4_MAXNAMLEN;
1361 1361
1362 BUG_ON(!server->nfs_client);
1363 BUG_ON(!server->nfs_client->rpc_ops);
1364 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1365
1366 spin_lock(&nfs_client_lock); 1362 spin_lock(&nfs_client_lock);
1367 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); 1363 list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks);
1368 list_add_tail(&server->master_link, &nfs_volume_list); 1364 list_add_tail(&server->master_link, &nfs_volume_list);
@@ -1400,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
1400 1396
1401 /* Initialise the client representation from the parent server */ 1397 /* Initialise the client representation from the parent server */
1402 nfs_server_copy_userdata(server, parent_server); 1398 nfs_server_copy_userdata(server, parent_server);
1403 server->caps |= NFS_CAP_ATOMIC_OPEN; 1399 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR;
1404 1400
1405 /* Get a client representation. 1401 /* Get a client representation.
1406 * Note: NFSv4 always uses TCP, */ 1402 * Note: NFSv4 always uses TCP, */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e4e089a8f294..6c3210099d51 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -934,9 +934,6 @@ out:
934 * back into its cache. We let the server do generic write 934 * back into its cache. We let the server do generic write
935 * parameter checking and report problems. 935 * parameter checking and report problems.
936 * 936 *
937 * We also avoid an unnecessary invocation of generic_osync_inode(),
938 * as it is fairly meaningless to sync the metadata of an NFS file.
939 *
940 * We eliminate local atime updates, see direct read above. 937 * We eliminate local atime updates, see direct read above.
941 * 938 *
942 * We avoid unnecessary page cache invalidations for normal cached 939 * We avoid unnecessary page cache invalidations for normal cached
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
new file mode 100644
index 000000000000..f4d54ba97cc6
--- /dev/null
+++ b/fs/nfs/dns_resolve.c
@@ -0,0 +1,335 @@
1/*
2 * linux/fs/nfs/dns_resolve.c
3 *
4 * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com>
5 *
6 * Resolves DNS hostnames into valid ip addresses
7 */
8
9#include <linux/hash.h>
10#include <linux/string.h>
11#include <linux/kmod.h>
12#include <linux/module.h>
13#include <linux/socket.h>
14#include <linux/seq_file.h>
15#include <linux/inet.h>
16#include <linux/sunrpc/clnt.h>
17#include <linux/sunrpc/cache.h>
18#include <linux/sunrpc/svcauth.h>
19
20#include "dns_resolve.h"
21#include "cache_lib.h"
22
23#define NFS_DNS_HASHBITS 4
24#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS)
25
26static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE];
27
28struct nfs_dns_ent {
29 struct cache_head h;
30
31 char *hostname;
32 size_t namelen;
33
34 struct sockaddr_storage addr;
35 size_t addrlen;
36};
37
38
39static void nfs_dns_ent_init(struct cache_head *cnew,
40 struct cache_head *ckey)
41{
42 struct nfs_dns_ent *new;
43 struct nfs_dns_ent *key;
44
45 new = container_of(cnew, struct nfs_dns_ent, h);
46 key = container_of(ckey, struct nfs_dns_ent, h);
47
48 kfree(new->hostname);
49 new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL);
50 if (new->hostname) {
51 new->namelen = key->namelen;
52 memcpy(&new->addr, &key->addr, key->addrlen);
53 new->addrlen = key->addrlen;
54 } else {
55 new->namelen = 0;
56 new->addrlen = 0;
57 }
58}
59
60static void nfs_dns_ent_put(struct kref *ref)
61{
62 struct nfs_dns_ent *item;
63
64 item = container_of(ref, struct nfs_dns_ent, h.ref);
65 kfree(item->hostname);
66 kfree(item);
67}
68
69static struct cache_head *nfs_dns_ent_alloc(void)
70{
71 struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL);
72
73 if (item != NULL) {
74 item->hostname = NULL;
75 item->namelen = 0;
76 item->addrlen = 0;
77 return &item->h;
78 }
79 return NULL;
80};
81
82static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key)
83{
84 return hash_str(key->hostname, NFS_DNS_HASHBITS);
85}
86
87static void nfs_dns_request(struct cache_detail *cd,
88 struct cache_head *ch,
89 char **bpp, int *blen)
90{
91 struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
92
93 qword_add(bpp, blen, key->hostname);
94 (*bpp)[-1] = '\n';
95}
96
97static int nfs_dns_upcall(struct cache_detail *cd,
98 struct cache_head *ch)
99{
100 struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h);
101 int ret;
102
103 ret = nfs_cache_upcall(cd, key->hostname);
104 if (ret)
105 ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request);
106 return ret;
107}
108
109static int nfs_dns_match(struct cache_head *ca,
110 struct cache_head *cb)
111{
112 struct nfs_dns_ent *a;
113 struct nfs_dns_ent *b;
114
115 a = container_of(ca, struct nfs_dns_ent, h);
116 b = container_of(cb, struct nfs_dns_ent, h);
117
118 if (a->namelen == 0 || a->namelen != b->namelen)
119 return 0;
120 return memcmp(a->hostname, b->hostname, a->namelen) == 0;
121}
122
123static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd,
124 struct cache_head *h)
125{
126 struct nfs_dns_ent *item;
127 long ttl;
128
129 if (h == NULL) {
130 seq_puts(m, "# ip address hostname ttl\n");
131 return 0;
132 }
133 item = container_of(h, struct nfs_dns_ent, h);
134 ttl = (long)item->h.expiry_time - (long)get_seconds();
135 if (ttl < 0)
136 ttl = 0;
137
138 if (!test_bit(CACHE_NEGATIVE, &h->flags)) {
139 char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1];
140
141 rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf));
142 seq_printf(m, "%15s ", buf);
143 } else
144 seq_puts(m, "<none> ");
145 seq_printf(m, "%15s %ld\n", item->hostname, ttl);
146 return 0;
147}
148
149struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd,
150 struct nfs_dns_ent *key)
151{
152 struct cache_head *ch;
153
154 ch = sunrpc_cache_lookup(cd,
155 &key->h,
156 nfs_dns_hash(key));
157 if (!ch)
158 return NULL;
159 return container_of(ch, struct nfs_dns_ent, h);
160}
161
162struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd,
163 struct nfs_dns_ent *new,
164 struct nfs_dns_ent *key)
165{
166 struct cache_head *ch;
167
168 ch = sunrpc_cache_update(cd,
169 &new->h, &key->h,
170 nfs_dns_hash(key));
171 if (!ch)
172 return NULL;
173 return container_of(ch, struct nfs_dns_ent, h);
174}
175
176static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
177{
178 char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
179 struct nfs_dns_ent key, *item;
180 unsigned long ttl;
181 ssize_t len;
182 int ret = -EINVAL;
183
184 if (buf[buflen-1] != '\n')
185 goto out;
186 buf[buflen-1] = '\0';
187
188 len = qword_get(&buf, buf1, sizeof(buf1));
189 if (len <= 0)
190 goto out;
191 key.addrlen = rpc_pton(buf1, len,
192 (struct sockaddr *)&key.addr,
193 sizeof(key.addr));
194
195 len = qword_get(&buf, buf1, sizeof(buf1));
196 if (len <= 0)
197 goto out;
198
199 key.hostname = buf1;
200 key.namelen = len;
201 memset(&key.h, 0, sizeof(key.h));
202
203 ttl = get_expiry(&buf);
204 if (ttl == 0)
205 goto out;
206 key.h.expiry_time = ttl + get_seconds();
207
208 ret = -ENOMEM;
209 item = nfs_dns_lookup(cd, &key);
210 if (item == NULL)
211 goto out;
212
213 if (key.addrlen == 0)
214 set_bit(CACHE_NEGATIVE, &key.h.flags);
215
216 item = nfs_dns_update(cd, &key, item);
217 if (item == NULL)
218 goto out;
219
220 ret = 0;
221 cache_put(&item->h, cd);
222out:
223 return ret;
224}
225
226static struct cache_detail nfs_dns_resolve = {
227 .owner = THIS_MODULE,
228 .hash_size = NFS_DNS_HASHTBL_SIZE,
229 .hash_table = nfs_dns_table,
230 .name = "dns_resolve",
231 .cache_put = nfs_dns_ent_put,
232 .cache_upcall = nfs_dns_upcall,
233 .cache_parse = nfs_dns_parse,
234 .cache_show = nfs_dns_show,
235 .match = nfs_dns_match,
236 .init = nfs_dns_ent_init,
237 .update = nfs_dns_ent_init,
238 .alloc = nfs_dns_ent_alloc,
239};
240
241static int do_cache_lookup(struct cache_detail *cd,
242 struct nfs_dns_ent *key,
243 struct nfs_dns_ent **item,
244 struct nfs_cache_defer_req *dreq)
245{
246 int ret = -ENOMEM;
247
248 *item = nfs_dns_lookup(cd, key);
249 if (*item) {
250 ret = cache_check(cd, &(*item)->h, &dreq->req);
251 if (ret)
252 *item = NULL;
253 }
254 return ret;
255}
256
257static int do_cache_lookup_nowait(struct cache_detail *cd,
258 struct nfs_dns_ent *key,
259 struct nfs_dns_ent **item)
260{
261 int ret = -ENOMEM;
262
263 *item = nfs_dns_lookup(cd, key);
264 if (!*item)
265 goto out_err;
266 ret = -ETIMEDOUT;
267 if (!test_bit(CACHE_VALID, &(*item)->h.flags)
268 || (*item)->h.expiry_time < get_seconds()
269 || cd->flush_time > (*item)->h.last_refresh)
270 goto out_put;
271 ret = -ENOENT;
272 if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags))
273 goto out_put;
274 return 0;
275out_put:
276 cache_put(&(*item)->h, cd);
277out_err:
278 *item = NULL;
279 return ret;
280}
281
282static int do_cache_lookup_wait(struct cache_detail *cd,
283 struct nfs_dns_ent *key,
284 struct nfs_dns_ent **item)
285{
286 struct nfs_cache_defer_req *dreq;
287 int ret = -ENOMEM;
288
289 dreq = nfs_cache_defer_req_alloc();
290 if (!dreq)
291 goto out;
292 ret = do_cache_lookup(cd, key, item, dreq);
293 if (ret == -EAGAIN) {
294 ret = nfs_cache_wait_for_upcall(dreq);
295 if (!ret)
296 ret = do_cache_lookup_nowait(cd, key, item);
297 }
298 nfs_cache_defer_req_put(dreq);
299out:
300 return ret;
301}
302
303ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
304 struct sockaddr *sa, size_t salen)
305{
306 struct nfs_dns_ent key = {
307 .hostname = name,
308 .namelen = namelen,
309 };
310 struct nfs_dns_ent *item = NULL;
311 ssize_t ret;
312
313 ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item);
314 if (ret == 0) {
315 if (salen >= item->addrlen) {
316 memcpy(sa, &item->addr, item->addrlen);
317 ret = item->addrlen;
318 } else
319 ret = -EOVERFLOW;
320 cache_put(&item->h, &nfs_dns_resolve);
321 } else if (ret == -ENOENT)
322 ret = -ESRCH;
323 return ret;
324}
325
326int nfs_dns_resolver_init(void)
327{
328 return nfs_cache_register(&nfs_dns_resolve);
329}
330
331void nfs_dns_resolver_destroy(void)
332{
333 nfs_cache_unregister(&nfs_dns_resolve);
334}
335
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
new file mode 100644
index 000000000000..a3f0938babf7
--- /dev/null
+++ b/fs/nfs/dns_resolve.h
@@ -0,0 +1,14 @@
/*
 * Resolve DNS hostnames into valid ip addresses
 */
#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H
#define __LINUX_FS_NFS_DNS_RESOLVE_H

/*
 * Forward declaration: without it, "struct sockaddr" in the prototype
 * below would be declared with function-prototype scope whenever this
 * header is included before the socket headers, making the prototype
 * incompatible with the real definition.
 */
struct sockaddr;

/*
 * NOTE(review): presumably the longest hostname the resolver accepts;
 * its users are not visible from this header - confirm.
 */
#define NFS_DNS_HOSTNAME_MAXLEN	(128)

/* Register the DNS resolver cache; returns the registration status. */
extern int nfs_dns_resolver_init(void);

/* Unregister the DNS resolver cache. */
extern void nfs_dns_resolver_destroy(void);

/*
 * Resolve @name (@namelen bytes) into the buffer @sa of size @salen.
 * Returns the length of the address stored in @sa, or a negative errno.
 */
extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
		struct sockaddr *sa, size_t salen);

#endif
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 05062329b678..5021b75d2d1e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
328} 328}
329 329
330/* 330/*
331 * Decide whether a read/modify/write cycle may be more efficient
332 * than a modify/write/read cycle when writing to a page in the
333 * page cache.
334 *
335 * The modify/write/read cycle may occur if a page is read before
336 * being completely filled by the writer. In this situation, the
337 * page must be completely written to stable storage on the server
338 * before it can be refilled by reading in the page from the server.
339 * This can lead to expensive, small, FILE_SYNC mode writes being
340 * done.
341 *
342 * It may be more efficient to read the page first if the file is
343 * open for reading in addition to writing, the page is not marked
344 * as Uptodate, it is not dirty or waiting to be committed,
345 * indicating that it was previously allocated and then modified,
346 * that there were valid bytes of data in that range of the file,
347 * and that the new data won't completely replace the old data in
348 * that range of the file.
349 */
350static int nfs_want_read_modify_write(struct file *file, struct page *page,
351 loff_t pos, unsigned len)
352{
353 unsigned int pglen = nfs_page_length(page);
354 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
355 unsigned int end = offset + len;
356
357 if ((file->f_mode & FMODE_READ) && /* open for read? */
358 !PageUptodate(page) && /* Uptodate? */
359 !PagePrivate(page) && /* i/o request already? */
360 pglen && /* valid bytes of file? */
361 (end < pglen || offset)) /* replace all valid bytes? */
362 return 1;
363 return 0;
364}
365
366/*
331 * This does the "real" work of the write. We must allocate and lock the 367 * This does the "real" work of the write. We must allocate and lock the
332 * page to be sent back to the generic routine, which then copies the 368 * page to be sent back to the generic routine, which then copies the
333 * data from user space. 369 * data from user space.
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
340 struct page **pagep, void **fsdata) 376 struct page **pagep, void **fsdata)
341{ 377{
342 int ret; 378 int ret;
343 pgoff_t index; 379 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
344 struct page *page; 380 struct page *page;
345 index = pos >> PAGE_CACHE_SHIFT; 381 int once_thru = 0;
346 382
347 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", 383 dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
348 file->f_path.dentry->d_parent->d_name.name, 384 file->f_path.dentry->d_parent->d_name.name,
349 file->f_path.dentry->d_name.name, 385 file->f_path.dentry->d_name.name,
350 mapping->host->i_ino, len, (long long) pos); 386 mapping->host->i_ino, len, (long long) pos);
351 387
388start:
352 /* 389 /*
353 * Prevent starvation issues if someone is doing a consistency 390 * Prevent starvation issues if someone is doing a consistency
354 * sync-to-disk 391 * sync-to-disk
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
367 if (ret) { 404 if (ret) {
368 unlock_page(page); 405 unlock_page(page);
369 page_cache_release(page); 406 page_cache_release(page);
407 } else if (!once_thru &&
408 nfs_want_read_modify_write(file, page, pos, len)) {
409 once_thru = 1;
410 ret = nfs_readpage(file, page);
411 page_cache_release(page);
412 if (!ret)
413 goto start;
370 } 414 }
371 return ret; 415 return ret;
372} 416}
@@ -479,6 +523,7 @@ const struct address_space_operations nfs_file_aops = {
479 .invalidatepage = nfs_invalidate_page, 523 .invalidatepage = nfs_invalidate_page,
480 .releasepage = nfs_release_page, 524 .releasepage = nfs_release_page,
481 .direct_IO = nfs_direct_IO, 525 .direct_IO = nfs_direct_IO,
526 .migratepage = nfs_migrate_page,
482 .launder_page = nfs_launder_page, 527 .launder_page = nfs_launder_page,
483}; 528};
484 529
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 86147b0ab2cf..21a84d45916f 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *);
101 101
102static unsigned int fnvhash32(const void *, size_t); 102static unsigned int fnvhash32(const void *, size_t);
103 103
104static struct rpc_pipe_ops idmap_upcall_ops = { 104static const struct rpc_pipe_ops idmap_upcall_ops = {
105 .upcall = idmap_pipe_upcall, 105 .upcall = idmap_pipe_upcall,
106 .downcall = idmap_pipe_downcall, 106 .downcall = idmap_pipe_downcall,
107 .destroy_msg = idmap_pipe_destroy_msg, 107 .destroy_msg = idmap_pipe_destroy_msg,
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp)
119 if (idmap == NULL) 119 if (idmap == NULL)
120 return -ENOMEM; 120 return -ENOMEM;
121 121
122 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", 122 idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry,
123 idmap, &idmap_upcall_ops, 0); 123 "idmap", idmap, &idmap_upcall_ops, 0);
124 if (IS_ERR(idmap->idmap_dentry)) { 124 if (IS_ERR(idmap->idmap_dentry)) {
125 error = PTR_ERR(idmap->idmap_dentry); 125 error = PTR_ERR(idmap->idmap_dentry);
126 kfree(idmap); 126 kfree(idmap);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bd7938eda6a8..060022b4651c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -46,6 +46,7 @@
46#include "iostat.h" 46#include "iostat.h"
47#include "internal.h" 47#include "internal.h"
48#include "fscache.h" 48#include "fscache.h"
49#include "dns_resolve.h"
49 50
50#define NFSDBG_FACILITY NFSDBG_VFS 51#define NFSDBG_FACILITY NFSDBG_VFS
51 52
@@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
286 /* We can't support update_atime(), since the server will reset it */ 287 /* We can't support update_atime(), since the server will reset it */
287 inode->i_flags |= S_NOATIME|S_NOCMTIME; 288 inode->i_flags |= S_NOATIME|S_NOCMTIME;
288 inode->i_mode = fattr->mode; 289 inode->i_mode = fattr->mode;
290 if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
291 && nfs_server_capable(inode, NFS_CAP_MODE))
292 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
293 | NFS_INO_INVALID_ACCESS
294 | NFS_INO_INVALID_ACL;
289 /* Why so? Because we want revalidate for devices/FIFOs, and 295 /* Why so? Because we want revalidate for devices/FIFOs, and
290 * that's precisely what we have in nfs_file_inode_operations. 296 * that's precisely what we have in nfs_file_inode_operations.
291 */ 297 */
@@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
330 nfsi->attr_gencount = fattr->gencount; 336 nfsi->attr_gencount = fattr->gencount;
331 if (fattr->valid & NFS_ATTR_FATTR_ATIME) 337 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
332 inode->i_atime = fattr->atime; 338 inode->i_atime = fattr->atime;
339 else if (nfs_server_capable(inode, NFS_CAP_ATIME))
340 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
333 if (fattr->valid & NFS_ATTR_FATTR_MTIME) 341 if (fattr->valid & NFS_ATTR_FATTR_MTIME)
334 inode->i_mtime = fattr->mtime; 342 inode->i_mtime = fattr->mtime;
343 else if (nfs_server_capable(inode, NFS_CAP_MTIME))
344 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
345 | NFS_INO_INVALID_DATA;
335 if (fattr->valid & NFS_ATTR_FATTR_CTIME) 346 if (fattr->valid & NFS_ATTR_FATTR_CTIME)
336 inode->i_ctime = fattr->ctime; 347 inode->i_ctime = fattr->ctime;
348 else if (nfs_server_capable(inode, NFS_CAP_CTIME))
349 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
350 | NFS_INO_INVALID_ACCESS
351 | NFS_INO_INVALID_ACL;
337 if (fattr->valid & NFS_ATTR_FATTR_CHANGE) 352 if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
338 nfsi->change_attr = fattr->change_attr; 353 nfsi->change_attr = fattr->change_attr;
354 else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
355 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
356 | NFS_INO_INVALID_DATA;
339 if (fattr->valid & NFS_ATTR_FATTR_SIZE) 357 if (fattr->valid & NFS_ATTR_FATTR_SIZE)
340 inode->i_size = nfs_size_to_loff_t(fattr->size); 358 inode->i_size = nfs_size_to_loff_t(fattr->size);
359 else
360 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
361 | NFS_INO_INVALID_DATA
362 | NFS_INO_REVAL_PAGECACHE;
341 if (fattr->valid & NFS_ATTR_FATTR_NLINK) 363 if (fattr->valid & NFS_ATTR_FATTR_NLINK)
342 inode->i_nlink = fattr->nlink; 364 inode->i_nlink = fattr->nlink;
365 else if (nfs_server_capable(inode, NFS_CAP_NLINK))
366 nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
343 if (fattr->valid & NFS_ATTR_FATTR_OWNER) 367 if (fattr->valid & NFS_ATTR_FATTR_OWNER)
344 inode->i_uid = fattr->uid; 368 inode->i_uid = fattr->uid;
369 else if (nfs_server_capable(inode, NFS_CAP_OWNER))
370 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
371 | NFS_INO_INVALID_ACCESS
372 | NFS_INO_INVALID_ACL;
345 if (fattr->valid & NFS_ATTR_FATTR_GROUP) 373 if (fattr->valid & NFS_ATTR_FATTR_GROUP)
346 inode->i_gid = fattr->gid; 374 inode->i_gid = fattr->gid;
375 else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
376 nfsi->cache_validity |= NFS_INO_INVALID_ATTR
377 | NFS_INO_INVALID_ACCESS
378 | NFS_INO_INVALID_ACL;
347 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) 379 if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
348 inode->i_blocks = fattr->du.nfs2.blocks; 380 inode->i_blocks = fattr->du.nfs2.blocks;
349 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { 381 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -1145,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1145 loff_t cur_isize, new_isize; 1177 loff_t cur_isize, new_isize;
1146 unsigned long invalid = 0; 1178 unsigned long invalid = 0;
1147 unsigned long now = jiffies; 1179 unsigned long now = jiffies;
1180 unsigned long save_cache_validity;
1148 1181
1149 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", 1182 dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
1150 __func__, inode->i_sb->s_id, inode->i_ino, 1183 __func__, inode->i_sb->s_id, inode->i_ino,
@@ -1171,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1171 */ 1204 */
1172 nfsi->read_cache_jiffies = fattr->time_start; 1205 nfsi->read_cache_jiffies = fattr->time_start;
1173 1206
1174 if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) 1207 save_cache_validity = nfsi->cache_validity;
1175 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR 1208 nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
1176 | NFS_INO_INVALID_ATIME 1209 | NFS_INO_INVALID_ATIME
1177 | NFS_INO_REVAL_PAGECACHE); 1210 | NFS_INO_REVAL_FORCED
1211 | NFS_INO_REVAL_PAGECACHE);
1178 1212
1179 /* Do atomic weak cache consistency updates */ 1213 /* Do atomic weak cache consistency updates */
1180 nfs_wcc_update_inode(inode, fattr); 1214 nfs_wcc_update_inode(inode, fattr);
@@ -1189,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1189 nfs_force_lookup_revalidate(inode); 1223 nfs_force_lookup_revalidate(inode);
1190 nfsi->change_attr = fattr->change_attr; 1224 nfsi->change_attr = fattr->change_attr;
1191 } 1225 }
1192 } 1226 } else if (server->caps & NFS_CAP_CHANGE_ATTR)
1227 invalid |= save_cache_validity;
1193 1228
1194 if (fattr->valid & NFS_ATTR_FATTR_MTIME) { 1229 if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
1195 /* NFSv2/v3: Check if the mtime agrees */ 1230 /* NFSv2/v3: Check if the mtime agrees */
@@ -1201,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1201 nfs_force_lookup_revalidate(inode); 1236 nfs_force_lookup_revalidate(inode);
1202 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); 1237 memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
1203 } 1238 }
1204 } 1239 } else if (server->caps & NFS_CAP_MTIME)
1240 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1241 | NFS_INO_INVALID_DATA
1242 | NFS_INO_REVAL_PAGECACHE
1243 | NFS_INO_REVAL_FORCED);
1244
1205 if (fattr->valid & NFS_ATTR_FATTR_CTIME) { 1245 if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
1206 /* If ctime has changed we should definitely clear access+acl caches */ 1246 /* If ctime has changed we should definitely clear access+acl caches */
1207 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { 1247 if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
@@ -1215,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1215 } 1255 }
1216 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); 1256 memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
1217 } 1257 }
1218 } 1258 } else if (server->caps & NFS_CAP_CTIME)
1259 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1260 | NFS_INO_INVALID_ACCESS
1261 | NFS_INO_INVALID_ACL
1262 | NFS_INO_REVAL_FORCED);
1219 1263
1220 /* Check if our cached file size is stale */ 1264 /* Check if our cached file size is stale */
1221 if (fattr->valid & NFS_ATTR_FATTR_SIZE) { 1265 if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
@@ -1231,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1231 dprintk("NFS: isize change on server for file %s/%ld\n", 1275 dprintk("NFS: isize change on server for file %s/%ld\n",
1232 inode->i_sb->s_id, inode->i_ino); 1276 inode->i_sb->s_id, inode->i_ino);
1233 } 1277 }
1234 } 1278 } else
1279 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1280 | NFS_INO_REVAL_PAGECACHE
1281 | NFS_INO_REVAL_FORCED);
1235 1282
1236 1283
1237 if (fattr->valid & NFS_ATTR_FATTR_ATIME) 1284 if (fattr->valid & NFS_ATTR_FATTR_ATIME)
1238 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); 1285 memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
1286 else if (server->caps & NFS_CAP_ATIME)
1287 invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
1288 | NFS_INO_REVAL_FORCED);
1239 1289
1240 if (fattr->valid & NFS_ATTR_FATTR_MODE) { 1290 if (fattr->valid & NFS_ATTR_FATTR_MODE) {
1241 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { 1291 if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
1242 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1292 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1243 inode->i_mode = fattr->mode; 1293 inode->i_mode = fattr->mode;
1244 } 1294 }
1245 } 1295 } else if (server->caps & NFS_CAP_MODE)
1296 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1297 | NFS_INO_INVALID_ACCESS
1298 | NFS_INO_INVALID_ACL
1299 | NFS_INO_REVAL_FORCED);
1300
1246 if (fattr->valid & NFS_ATTR_FATTR_OWNER) { 1301 if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
1247 if (inode->i_uid != fattr->uid) { 1302 if (inode->i_uid != fattr->uid) {
1248 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1303 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1249 inode->i_uid = fattr->uid; 1304 inode->i_uid = fattr->uid;
1250 } 1305 }
1251 } 1306 } else if (server->caps & NFS_CAP_OWNER)
1307 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1308 | NFS_INO_INVALID_ACCESS
1309 | NFS_INO_INVALID_ACL
1310 | NFS_INO_REVAL_FORCED);
1311
1252 if (fattr->valid & NFS_ATTR_FATTR_GROUP) { 1312 if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
1253 if (inode->i_gid != fattr->gid) { 1313 if (inode->i_gid != fattr->gid) {
1254 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; 1314 invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
1255 inode->i_gid = fattr->gid; 1315 inode->i_gid = fattr->gid;
1256 } 1316 }
1257 } 1317 } else if (server->caps & NFS_CAP_OWNER_GROUP)
1318 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1319 | NFS_INO_INVALID_ACCESS
1320 | NFS_INO_INVALID_ACL
1321 | NFS_INO_REVAL_FORCED);
1258 1322
1259 if (fattr->valid & NFS_ATTR_FATTR_NLINK) { 1323 if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
1260 if (inode->i_nlink != fattr->nlink) { 1324 if (inode->i_nlink != fattr->nlink) {
@@ -1263,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1263 invalid |= NFS_INO_INVALID_DATA; 1327 invalid |= NFS_INO_INVALID_DATA;
1264 inode->i_nlink = fattr->nlink; 1328 inode->i_nlink = fattr->nlink;
1265 } 1329 }
1266 } 1330 } else if (server->caps & NFS_CAP_NLINK)
1331 invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
1332 | NFS_INO_REVAL_FORCED);
1267 1333
1268 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { 1334 if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
1269 /* 1335 /*
@@ -1293,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1293 || S_ISLNK(inode->i_mode))) 1359 || S_ISLNK(inode->i_mode)))
1294 invalid &= ~NFS_INO_INVALID_DATA; 1360 invalid &= ~NFS_INO_INVALID_DATA;
1295 if (!nfs_have_delegation(inode, FMODE_READ) || 1361 if (!nfs_have_delegation(inode, FMODE_READ) ||
1296 (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) 1362 (save_cache_validity & NFS_INO_REVAL_FORCED))
1297 nfsi->cache_validity |= invalid; 1363 nfsi->cache_validity |= invalid;
1298 nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED;
1299 1364
1300 return 0; 1365 return 0;
1301 out_changed: 1366 out_changed:
@@ -1442,6 +1507,10 @@ static int __init init_nfs_fs(void)
1442{ 1507{
1443 int err; 1508 int err;
1444 1509
1510 err = nfs_dns_resolver_init();
1511 if (err < 0)
1512 goto out8;
1513
1445 err = nfs_fscache_register(); 1514 err = nfs_fscache_register();
1446 if (err < 0) 1515 if (err < 0)
1447 goto out7; 1516 goto out7;
@@ -1500,6 +1569,8 @@ out5:
1500out6: 1569out6:
1501 nfs_fscache_unregister(); 1570 nfs_fscache_unregister();
1502out7: 1571out7:
1572 nfs_dns_resolver_destroy();
1573out8:
1503 return err; 1574 return err;
1504} 1575}
1505 1576
@@ -1511,6 +1582,7 @@ static void __exit exit_nfs_fs(void)
1511 nfs_destroy_inodecache(); 1582 nfs_destroy_inodecache();
1512 nfs_destroy_nfspagecache(); 1583 nfs_destroy_nfspagecache();
1513 nfs_fscache_unregister(); 1584 nfs_fscache_unregister();
1585 nfs_dns_resolver_destroy();
1514#ifdef CONFIG_PROC_FS 1586#ifdef CONFIG_PROC_FS
1515 rpc_proc_unregister("nfs"); 1587 rpc_proc_unregister("nfs");
1516#endif 1588#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7dd90a6769d0..e21b1bb9972f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -49,6 +49,11 @@ struct nfs_clone_mount {
49#define NFS_MAX_SECFLAVORS (12) 49#define NFS_MAX_SECFLAVORS (12)
50 50
51/* 51/*
52 * Value used if the user did not specify a port value.
53 */
54#define NFS_UNSPEC_PORT (-1)
55
56/*
52 * In-kernel mount arguments 57 * In-kernel mount arguments
53 */ 58 */
54struct nfs_parsed_mount_data { 59struct nfs_parsed_mount_data {
@@ -63,6 +68,7 @@ struct nfs_parsed_mount_data {
63 unsigned int auth_flavor_len; 68 unsigned int auth_flavor_len;
64 rpc_authflavor_t auth_flavors[1]; 69 rpc_authflavor_t auth_flavors[1];
65 char *client_address; 70 char *client_address;
71 unsigned int version;
66 unsigned int minorversion; 72 unsigned int minorversion;
67 char *fscache_uniq; 73 char *fscache_uniq;
68 74
@@ -71,7 +77,7 @@ struct nfs_parsed_mount_data {
71 size_t addrlen; 77 size_t addrlen;
72 char *hostname; 78 char *hostname;
73 u32 version; 79 u32 version;
74 unsigned short port; 80 int port;
75 unsigned short protocol; 81 unsigned short protocol;
76 } mount_server; 82 } mount_server;
77 83
@@ -80,7 +86,7 @@ struct nfs_parsed_mount_data {
80 size_t addrlen; 86 size_t addrlen;
81 char *hostname; 87 char *hostname;
82 char *export_path; 88 char *export_path;
83 unsigned short port; 89 int port;
84 unsigned short protocol; 90 unsigned short protocol;
85 } nfs_server; 91 } nfs_server;
86 92
@@ -102,6 +108,7 @@ struct nfs_mount_request {
102}; 108};
103 109
104extern int nfs_mount(struct nfs_mount_request *info); 110extern int nfs_mount(struct nfs_mount_request *info);
111extern void nfs_umount(const struct nfs_mount_request *info);
105 112
106/* client.c */ 113/* client.c */
107extern struct rpc_program nfs_program; 114extern struct rpc_program nfs_program;
@@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode);
213extern int nfs_wait_bit_killable(void *word); 220extern int nfs_wait_bit_killable(void *word);
214 221
215/* super.c */ 222/* super.c */
216void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
217extern struct file_system_type nfs_xdev_fs_type; 223extern struct file_system_type nfs_xdev_fs_type;
218#ifdef CONFIG_NFS_V4 224#ifdef CONFIG_NFS_V4
219extern struct file_system_type nfs4_xdev_fs_type; 225extern struct file_system_type nfs4_xdev_fs_type;
@@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
248 254
249/* write.c */ 255/* write.c */
250extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 256extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
257#ifdef CONFIG_MIGRATION
258extern int nfs_migrate_page(struct address_space *,
259 struct page *, struct page *);
260#else
261#define nfs_migrate_page NULL
262#endif
251 263
252/* nfs4proc.c */ 264/* nfs4proc.c */
253extern int _nfs4_call_sync(struct nfs_server *server, 265extern int _nfs4_call_sync(struct nfs_server *server,
@@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len)
368 return ((unsigned long)len + (unsigned long)base + 380 return ((unsigned long)len + (unsigned long)base +
369 PAGE_SIZE - 1) >> PAGE_SHIFT; 381 PAGE_SIZE - 1) >> PAGE_SHIFT;
370} 382}
371
372#define IPV6_SCOPE_DELIMITER '%'
373
374/*
375 * Set the port number in an address. Be agnostic about the address
376 * family.
377 */
378static inline void nfs_set_port(struct sockaddr *sap, unsigned short port)
379{
380 struct sockaddr_in *ap = (struct sockaddr_in *)sap;
381 struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap;
382
383 switch (sap->sa_family) {
384 case AF_INET:
385 ap->sin_port = htons(port);
386 break;
387 case AF_INET6:
388 ap6->sin6_port = htons(port);
389 break;
390 }
391}
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 38ef9eaec407..0adefc40cc89 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -209,6 +209,71 @@ out_mnt_err:
209 goto out; 209 goto out;
210} 210}
211 211
212/**
213 * nfs_umount - Notify a server that we have unmounted this export
214 * @info: pointer to umount request arguments
215 *
216 * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
217 * use UDP.
218 */
219void nfs_umount(const struct nfs_mount_request *info)
220{
221 static const struct rpc_timeout nfs_umnt_timeout = {
222 .to_initval = 1 * HZ,
223 .to_maxval = 3 * HZ,
224 .to_retries = 2,
225 };
226 struct rpc_create_args args = {
227 .protocol = IPPROTO_UDP,
228 .address = info->sap,
229 .addrsize = info->salen,
230 .timeout = &nfs_umnt_timeout,
231 .servername = info->hostname,
232 .program = &mnt_program,
233 .version = info->version,
234 .authflavor = RPC_AUTH_UNIX,
235 .flags = RPC_CLNT_CREATE_NOPING,
236 };
237 struct mountres result;
238 struct rpc_message msg = {
239 .rpc_argp = info->dirpath,
240 .rpc_resp = &result,
241 };
242 struct rpc_clnt *clnt;
243 int status;
244
245 if (info->noresvport)
246 args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
247
248 clnt = rpc_create(&args);
249 if (unlikely(IS_ERR(clnt)))
250 goto out_clnt_err;
251
252 dprintk("NFS: sending UMNT request for %s:%s\n",
253 (info->hostname ? info->hostname : "server"), info->dirpath);
254
255 if (info->version == NFS_MNT3_VERSION)
256 msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
257 else
258 msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
259
260 status = rpc_call_sync(clnt, &msg, 0);
261 rpc_shutdown_client(clnt);
262
263 if (unlikely(status < 0))
264 goto out_call_err;
265
266 return;
267
268out_clnt_err:
269 dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
270 PTR_ERR(clnt));
271 return;
272
273out_call_err:
274 dprintk("NFS: UMNT request failed, status=%d\n", status);
275}
276
212/* 277/*
213 * XDR encode/decode functions for MOUNT 278 * XDR encode/decode functions for MOUNT
214 */ 279 */
@@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res)
258 return -EIO; 323 return -EIO;
259 status = ntohl(*p); 324 status = ntohl(*p);
260 325
261 for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { 326 for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) {
262 if (mnt_errtbl[i].status == status) { 327 if (mnt_errtbl[i].status == status) {
263 res->errno = mnt_errtbl[i].errno; 328 res->errno = mnt_errtbl[i].errno;
264 return 0; 329 return 0;
@@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res)
309 return -EIO; 374 return -EIO;
310 status = ntohl(*p); 375 status = ntohl(*p);
311 376
312 for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { 377 for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) {
313 if (mnt3_errtbl[i].status == status) { 378 if (mnt3_errtbl[i].status == status) {
314 res->errno = mnt3_errtbl[i].errno; 379 res->errno = mnt3_errtbl[i].errno;
315 return 0; 380 return 0;
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = {
407 .p_statidx = MOUNTPROC_MNT, 472 .p_statidx = MOUNTPROC_MNT,
408 .p_name = "MOUNT", 473 .p_name = "MOUNT",
409 }, 474 },
475 [MOUNTPROC_UMNT] = {
476 .p_proc = MOUNTPROC_UMNT,
477 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
478 .p_arglen = MNT_enc_dirpath_sz,
479 .p_statidx = MOUNTPROC_UMNT,
480 .p_name = "UMOUNT",
481 },
410}; 482};
411 483
412static struct rpc_procinfo mnt3_procedures[] = { 484static struct rpc_procinfo mnt3_procedures[] = {
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = {
419 .p_statidx = MOUNTPROC3_MNT, 491 .p_statidx = MOUNTPROC3_MNT,
420 .p_name = "MOUNT", 492 .p_name = "MOUNT",
421 }, 493 },
494 [MOUNTPROC3_UMNT] = {
495 .p_proc = MOUNTPROC3_UMNT,
496 .p_encode = (kxdrproc_t)mnt_enc_dirpath,
497 .p_arglen = MNT_enc_dirpath_sz,
498 .p_statidx = MOUNTPROC3_UMNT,
499 .p_name = "UMOUNT",
500 },
422}; 501};
423 502
424 503
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d0cc5ce0edfe..ee6a13f05443 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data)
299 299
300/* 300/*
301 * Create a regular file. 301 * Create a regular file.
302 * For now, we don't implement O_EXCL.
303 */ 302 */
304static int 303static int
305nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 304nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 2a2a0a7143ad..2636c26d56fa 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -17,6 +17,7 @@
17#include <linux/inet.h> 17#include <linux/inet.h>
18#include "internal.h" 18#include "internal.h"
19#include "nfs4_fs.h" 19#include "nfs4_fs.h"
20#include "dns_resolve.h"
20 21
21#define NFSDBG_FACILITY NFSDBG_VFS 22#define NFSDBG_FACILITY NFSDBG_VFS
22 23
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
95 return 0; 96 return 0;
96} 97}
97 98
/*
 * Convert the server name @string (@len bytes) into a socket address.
 *
 * rpc_pton() is tried first; if it returns 0, fall back to
 * nfs_dns_resolve_name().  Returns the length of the address stored in
 * @sa, or 0 if neither step produced one.
 */
static size_t nfs_parse_server_name(char *string, size_t len,
		struct sockaddr *sa, size_t salen)
{
	ssize_t addrlen = rpc_pton(string, len, sa, salen);

	if (addrlen != 0)
		return addrlen;

	/* rpc_pton() produced nothing: try the DNS resolver cache */
	addrlen = nfs_dns_resolve_name(string, len, sa, salen);
	return addrlen < 0 ? 0 : addrlen;
}
112
98static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, 113static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
99 char *page, char *page2, 114 char *page, char *page2,
100 const struct nfs4_fs_location *location) 115 const struct nfs4_fs_location *location)
@@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
121 136
122 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) 137 if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len))
123 continue; 138 continue;
124 nfs_parse_ip_address(buf->data, buf->len, 139 mountdata->addrlen = nfs_parse_server_name(buf->data,
125 mountdata->addr, &mountdata->addrlen); 140 buf->len,
126 if (mountdata->addr->sa_family == AF_UNSPEC) 141 mountdata->addr, mountdata->addrlen);
142 if (mountdata->addrlen == 0)
127 continue; 143 continue;
128 nfs_set_port(mountdata->addr, NFS_PORT); 144 rpc_set_port(mountdata->addr, NFS_PORT);
129 145
130 memcpy(page2, buf->data, buf->len); 146 memcpy(page2, buf->data, buf->len);
131 page2[buf->len] = '\0'; 147 page2[buf->len] = '\0';
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6917311f201c..be6544aef41f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -61,6 +61,8 @@
61#define NFS4_POLL_RETRY_MIN (HZ/10) 61#define NFS4_POLL_RETRY_MIN (HZ/10)
62#define NFS4_POLL_RETRY_MAX (15*HZ) 62#define NFS4_POLL_RETRY_MAX (15*HZ)
63 63
64#define NFS4_MAX_LOOP_ON_RECOVER (10)
65
64struct nfs4_opendata; 66struct nfs4_opendata;
65static int _nfs4_proc_open(struct nfs4_opendata *data); 67static int _nfs4_proc_open(struct nfs4_opendata *data);
66static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 68static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
@@ -426,17 +428,19 @@ out:
426static int nfs4_recover_session(struct nfs4_session *session) 428static int nfs4_recover_session(struct nfs4_session *session)
427{ 429{
428 struct nfs_client *clp = session->clp; 430 struct nfs_client *clp = session->clp;
431 unsigned int loop;
429 int ret; 432 int ret;
430 433
431 for (;;) { 434 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
432 ret = nfs4_wait_clnt_recover(clp); 435 ret = nfs4_wait_clnt_recover(clp);
433 if (ret != 0) 436 if (ret != 0)
434 return ret; 437 break;
435 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) 438 if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state))
436 break; 439 break;
437 nfs4_schedule_state_manager(clp); 440 nfs4_schedule_state_manager(clp);
441 ret = -EIO;
438 } 442 }
439 return 0; 443 return ret;
440} 444}
441 445
442static int nfs41_setup_sequence(struct nfs4_session *session, 446static int nfs41_setup_sequence(struct nfs4_session *session,
@@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1444static int nfs4_recover_expired_lease(struct nfs_server *server) 1448static int nfs4_recover_expired_lease(struct nfs_server *server)
1445{ 1449{
1446 struct nfs_client *clp = server->nfs_client; 1450 struct nfs_client *clp = server->nfs_client;
1451 unsigned int loop;
1447 int ret; 1452 int ret;
1448 1453
1449 for (;;) { 1454 for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
1450 ret = nfs4_wait_clnt_recover(clp); 1455 ret = nfs4_wait_clnt_recover(clp);
1451 if (ret != 0) 1456 if (ret != 0)
1452 return ret; 1457 break;
1453 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1458 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1454 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1459 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1455 break; 1460 break;
1456 nfs4_schedule_state_recovery(clp); 1461 nfs4_schedule_state_recovery(clp);
1462 ret = -EIO;
1457 } 1463 }
1458 return 0; 1464 return ret;
1459} 1465}
1460 1466
1461/* 1467/*
@@ -1997,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
1997 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2003 status = nfs4_call_sync(server, &msg, &args, &res, 0);
1998 if (status == 0) { 2004 if (status == 0) {
1999 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 2005 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
2006 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
2007 NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
2008 NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
2009 NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
2010 NFS_CAP_CTIME|NFS_CAP_MTIME);
2000 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) 2011 if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
2001 server->caps |= NFS_CAP_ACLS; 2012 server->caps |= NFS_CAP_ACLS;
2002 if (res.has_links != 0) 2013 if (res.has_links != 0)
2003 server->caps |= NFS_CAP_HARDLINKS; 2014 server->caps |= NFS_CAP_HARDLINKS;
2004 if (res.has_symlinks != 0) 2015 if (res.has_symlinks != 0)
2005 server->caps |= NFS_CAP_SYMLINKS; 2016 server->caps |= NFS_CAP_SYMLINKS;
2017 if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID)
2018 server->caps |= NFS_CAP_FILEID;
2019 if (res.attr_bitmask[1] & FATTR4_WORD1_MODE)
2020 server->caps |= NFS_CAP_MODE;
2021 if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS)
2022 server->caps |= NFS_CAP_NLINK;
2023 if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER)
2024 server->caps |= NFS_CAP_OWNER;
2025 if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP)
2026 server->caps |= NFS_CAP_OWNER_GROUP;
2027 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS)
2028 server->caps |= NFS_CAP_ATIME;
2029 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA)
2030 server->caps |= NFS_CAP_CTIME;
2031 if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY)
2032 server->caps |= NFS_CAP_MTIME;
2033
2006 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); 2034 memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
2007 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; 2035 server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
2008 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; 2036 server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 617273e7d47f..cfc30d362f94 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -702,29 +702,12 @@ struct compound_hdr {
702 u32 minorversion; 702 u32 minorversion;
703}; 703};
704 704
705/* 705static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes)
706 * START OF "GENERIC" ENCODE ROUTINES. 706{
707 * These may look a little ugly since they are imported from a "generic" 707 __be32 *p = xdr_reserve_space(xdr, nbytes);
708 * set of XDR encode/decode routines which are intended to be shared by 708 BUG_ON(!p);
709 * all of our NFSv4 implementations (OpenBSD, MacOS X...). 709 return p;
710 * 710}
711 * If the pain of reading these is too great, it should be a straightforward
712 * task to translate them into Linux-specific versions which are more
713 * consistent with the style used in NFSv2/v3...
714 */
715#define WRITE32(n) *p++ = htonl(n)
716#define WRITE64(n) do { \
717 *p++ = htonl((uint32_t)((n) >> 32)); \
718 *p++ = htonl((uint32_t)(n)); \
719} while (0)
720#define WRITEMEM(ptr,nbytes) do { \
721 p = xdr_encode_opaque_fixed(p, ptr, nbytes); \
722} while (0)
723
724#define RESERVE_SPACE(nbytes) do { \
725 p = xdr_reserve_space(xdr, nbytes); \
726 BUG_ON(!p); \
727} while (0)
728 711
729static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) 712static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
730{ 713{
@@ -749,12 +732,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr,
749 732
750 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); 733 dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag);
751 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); 734 BUG_ON(hdr->taglen > NFS4_MAXTAGLEN);
752 RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); 735 p = reserve_space(xdr, 4 + hdr->taglen + 8);
753 WRITE32(hdr->taglen); 736 p = xdr_encode_opaque(p, hdr->tag, hdr->taglen);
754 WRITEMEM(hdr->tag, hdr->taglen); 737 *p++ = cpu_to_be32(hdr->minorversion);
755 WRITE32(hdr->minorversion);
756 hdr->nops_p = p; 738 hdr->nops_p = p;
757 WRITE32(hdr->nops); 739 *p = cpu_to_be32(hdr->nops);
758} 740}
759 741
760static void encode_nops(struct compound_hdr *hdr) 742static void encode_nops(struct compound_hdr *hdr)
@@ -829,55 +811,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
829 len += 16; 811 len += 16;
830 else if (iap->ia_valid & ATTR_MTIME) 812 else if (iap->ia_valid & ATTR_MTIME)
831 len += 4; 813 len += 4;
832 RESERVE_SPACE(len); 814 p = reserve_space(xdr, len);
833 815
834 /* 816 /*
835 * We write the bitmap length now, but leave the bitmap and the attribute 817 * We write the bitmap length now, but leave the bitmap and the attribute
836 * buffer length to be backfilled at the end of this routine. 818 * buffer length to be backfilled at the end of this routine.
837 */ 819 */
838 WRITE32(2); 820 *p++ = cpu_to_be32(2);
839 q = p; 821 q = p;
840 p += 3; 822 p += 3;
841 823
842 if (iap->ia_valid & ATTR_SIZE) { 824 if (iap->ia_valid & ATTR_SIZE) {
843 bmval0 |= FATTR4_WORD0_SIZE; 825 bmval0 |= FATTR4_WORD0_SIZE;
844 WRITE64(iap->ia_size); 826 p = xdr_encode_hyper(p, iap->ia_size);
845 } 827 }
846 if (iap->ia_valid & ATTR_MODE) { 828 if (iap->ia_valid & ATTR_MODE) {
847 bmval1 |= FATTR4_WORD1_MODE; 829 bmval1 |= FATTR4_WORD1_MODE;
848 WRITE32(iap->ia_mode & S_IALLUGO); 830 *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO);
849 } 831 }
850 if (iap->ia_valid & ATTR_UID) { 832 if (iap->ia_valid & ATTR_UID) {
851 bmval1 |= FATTR4_WORD1_OWNER; 833 bmval1 |= FATTR4_WORD1_OWNER;
852 WRITE32(owner_namelen); 834 p = xdr_encode_opaque(p, owner_name, owner_namelen);
853 WRITEMEM(owner_name, owner_namelen);
854 } 835 }
855 if (iap->ia_valid & ATTR_GID) { 836 if (iap->ia_valid & ATTR_GID) {
856 bmval1 |= FATTR4_WORD1_OWNER_GROUP; 837 bmval1 |= FATTR4_WORD1_OWNER_GROUP;
857 WRITE32(owner_grouplen); 838 p = xdr_encode_opaque(p, owner_group, owner_grouplen);
858 WRITEMEM(owner_group, owner_grouplen);
859 } 839 }
860 if (iap->ia_valid & ATTR_ATIME_SET) { 840 if (iap->ia_valid & ATTR_ATIME_SET) {
861 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 841 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
862 WRITE32(NFS4_SET_TO_CLIENT_TIME); 842 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
863 WRITE32(0); 843 *p++ = cpu_to_be32(0);
864 WRITE32(iap->ia_mtime.tv_sec); 844 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
865 WRITE32(iap->ia_mtime.tv_nsec); 845 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
866 } 846 }
867 else if (iap->ia_valid & ATTR_ATIME) { 847 else if (iap->ia_valid & ATTR_ATIME) {
868 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; 848 bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET;
869 WRITE32(NFS4_SET_TO_SERVER_TIME); 849 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
870 } 850 }
871 if (iap->ia_valid & ATTR_MTIME_SET) { 851 if (iap->ia_valid & ATTR_MTIME_SET) {
872 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 852 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
873 WRITE32(NFS4_SET_TO_CLIENT_TIME); 853 *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME);
874 WRITE32(0); 854 *p++ = cpu_to_be32(0);
875 WRITE32(iap->ia_mtime.tv_sec); 855 *p++ = cpu_to_be32(iap->ia_mtime.tv_sec);
876 WRITE32(iap->ia_mtime.tv_nsec); 856 *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec);
877 } 857 }
878 else if (iap->ia_valid & ATTR_MTIME) { 858 else if (iap->ia_valid & ATTR_MTIME) {
879 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; 859 bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
880 WRITE32(NFS4_SET_TO_SERVER_TIME); 860 *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME);
881 } 861 }
882 862
883 /* 863 /*
@@ -891,7 +871,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
891 len = (char *)p - (char *)q - 12; 871 len = (char *)p - (char *)q - 12;
892 *q++ = htonl(bmval0); 872 *q++ = htonl(bmval0);
893 *q++ = htonl(bmval1); 873 *q++ = htonl(bmval1);
894 *q++ = htonl(len); 874 *q = htonl(len);
895 875
896/* out: */ 876/* out: */
897} 877}
@@ -900,9 +880,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd
900{ 880{
901 __be32 *p; 881 __be32 *p;
902 882
903 RESERVE_SPACE(8); 883 p = reserve_space(xdr, 8);
904 WRITE32(OP_ACCESS); 884 *p++ = cpu_to_be32(OP_ACCESS);
905 WRITE32(access); 885 *p = cpu_to_be32(access);
906 hdr->nops++; 886 hdr->nops++;
907 hdr->replen += decode_access_maxsz; 887 hdr->replen += decode_access_maxsz;
908} 888}
@@ -911,10 +891,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
911{ 891{
912 __be32 *p; 892 __be32 *p;
913 893
914 RESERVE_SPACE(8+NFS4_STATEID_SIZE); 894 p = reserve_space(xdr, 8+NFS4_STATEID_SIZE);
915 WRITE32(OP_CLOSE); 895 *p++ = cpu_to_be32(OP_CLOSE);
916 WRITE32(arg->seqid->sequence->counter); 896 *p++ = cpu_to_be32(arg->seqid->sequence->counter);
917 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 897 xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
918 hdr->nops++; 898 hdr->nops++;
919 hdr->replen += decode_close_maxsz; 899 hdr->replen += decode_close_maxsz;
920} 900}
@@ -923,10 +903,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar
923{ 903{
924 __be32 *p; 904 __be32 *p;
925 905
926 RESERVE_SPACE(16); 906 p = reserve_space(xdr, 16);
927 WRITE32(OP_COMMIT); 907 *p++ = cpu_to_be32(OP_COMMIT);
928 WRITE64(args->offset); 908 p = xdr_encode_hyper(p, args->offset);
929 WRITE32(args->count); 909 *p = cpu_to_be32(args->count);
930 hdr->nops++; 910 hdr->nops++;
931 hdr->replen += decode_commit_maxsz; 911 hdr->replen += decode_commit_maxsz;
932} 912}
@@ -935,30 +915,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *
935{ 915{
936 __be32 *p; 916 __be32 *p;
937 917
938 RESERVE_SPACE(8); 918 p = reserve_space(xdr, 8);
939 WRITE32(OP_CREATE); 919 *p++ = cpu_to_be32(OP_CREATE);
940 WRITE32(create->ftype); 920 *p = cpu_to_be32(create->ftype);
941 921
942 switch (create->ftype) { 922 switch (create->ftype) {
943 case NF4LNK: 923 case NF4LNK:
944 RESERVE_SPACE(4); 924 p = reserve_space(xdr, 4);
945 WRITE32(create->u.symlink.len); 925 *p = cpu_to_be32(create->u.symlink.len);
946 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); 926 xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
947 break; 927 break;
948 928
949 case NF4BLK: case NF4CHR: 929 case NF4BLK: case NF4CHR:
950 RESERVE_SPACE(8); 930 p = reserve_space(xdr, 8);
951 WRITE32(create->u.device.specdata1); 931 *p++ = cpu_to_be32(create->u.device.specdata1);
952 WRITE32(create->u.device.specdata2); 932 *p = cpu_to_be32(create->u.device.specdata2);
953 break; 933 break;
954 934
955 default: 935 default:
956 break; 936 break;
957 } 937 }
958 938
959 RESERVE_SPACE(4 + create->name->len); 939 encode_string(xdr, create->name->len, create->name->name);
960 WRITE32(create->name->len);
961 WRITEMEM(create->name->name, create->name->len);
962 hdr->nops++; 940 hdr->nops++;
963 hdr->replen += decode_create_maxsz; 941 hdr->replen += decode_create_maxsz;
964 942
@@ -969,10 +947,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c
969{ 947{
970 __be32 *p; 948 __be32 *p;
971 949
972 RESERVE_SPACE(12); 950 p = reserve_space(xdr, 12);
973 WRITE32(OP_GETATTR); 951 *p++ = cpu_to_be32(OP_GETATTR);
974 WRITE32(1); 952 *p++ = cpu_to_be32(1);
975 WRITE32(bitmap); 953 *p = cpu_to_be32(bitmap);
976 hdr->nops++; 954 hdr->nops++;
977 hdr->replen += decode_getattr_maxsz; 955 hdr->replen += decode_getattr_maxsz;
978} 956}
@@ -981,11 +959,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm
981{ 959{
982 __be32 *p; 960 __be32 *p;
983 961
984 RESERVE_SPACE(16); 962 p = reserve_space(xdr, 16);
985 WRITE32(OP_GETATTR); 963 *p++ = cpu_to_be32(OP_GETATTR);
986 WRITE32(2); 964 *p++ = cpu_to_be32(2);
987 WRITE32(bm0); 965 *p++ = cpu_to_be32(bm0);
988 WRITE32(bm1); 966 *p = cpu_to_be32(bm1);
989 hdr->nops++; 967 hdr->nops++;
990 hdr->replen += decode_getattr_maxsz; 968 hdr->replen += decode_getattr_maxsz;
991} 969}
@@ -1012,8 +990,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1012{ 990{
1013 __be32 *p; 991 __be32 *p;
1014 992
1015 RESERVE_SPACE(4); 993 p = reserve_space(xdr, 4);
1016 WRITE32(OP_GETFH); 994 *p = cpu_to_be32(OP_GETFH);
1017 hdr->nops++; 995 hdr->nops++;
1018 hdr->replen += decode_getfh_maxsz; 996 hdr->replen += decode_getfh_maxsz;
1019} 997}
@@ -1022,10 +1000,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
1022{ 1000{
1023 __be32 *p; 1001 __be32 *p;
1024 1002
1025 RESERVE_SPACE(8 + name->len); 1003 p = reserve_space(xdr, 8 + name->len);
1026 WRITE32(OP_LINK); 1004 *p++ = cpu_to_be32(OP_LINK);
1027 WRITE32(name->len); 1005 xdr_encode_opaque(p, name->name, name->len);
1028 WRITEMEM(name->name, name->len);
1029 hdr->nops++; 1006 hdr->nops++;
1030 hdr->replen += decode_link_maxsz; 1007 hdr->replen += decode_link_maxsz;
1031} 1008}
@@ -1052,27 +1029,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
1052{ 1029{
1053 __be32 *p; 1030 __be32 *p;
1054 1031
1055 RESERVE_SPACE(32); 1032 p = reserve_space(xdr, 32);
1056 WRITE32(OP_LOCK); 1033 *p++ = cpu_to_be32(OP_LOCK);
1057 WRITE32(nfs4_lock_type(args->fl, args->block)); 1034 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block));
1058 WRITE32(args->reclaim); 1035 *p++ = cpu_to_be32(args->reclaim);
1059 WRITE64(args->fl->fl_start); 1036 p = xdr_encode_hyper(p, args->fl->fl_start);
1060 WRITE64(nfs4_lock_length(args->fl)); 1037 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1061 WRITE32(args->new_lock_owner); 1038 *p = cpu_to_be32(args->new_lock_owner);
1062 if (args->new_lock_owner){ 1039 if (args->new_lock_owner){
1063 RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); 1040 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32);
1064 WRITE32(args->open_seqid->sequence->counter); 1041 *p++ = cpu_to_be32(args->open_seqid->sequence->counter);
1065 WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); 1042 p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE);
1066 WRITE32(args->lock_seqid->sequence->counter); 1043 *p++ = cpu_to_be32(args->lock_seqid->sequence->counter);
1067 WRITE64(args->lock_owner.clientid); 1044 p = xdr_encode_hyper(p, args->lock_owner.clientid);
1068 WRITE32(16); 1045 *p++ = cpu_to_be32(16);
1069 WRITEMEM("lock id:", 8); 1046 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1070 WRITE64(args->lock_owner.id); 1047 xdr_encode_hyper(p, args->lock_owner.id);
1071 } 1048 }
1072 else { 1049 else {
1073 RESERVE_SPACE(NFS4_STATEID_SIZE+4); 1050 p = reserve_space(xdr, NFS4_STATEID_SIZE+4);
1074 WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); 1051 p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE);
1075 WRITE32(args->lock_seqid->sequence->counter); 1052 *p = cpu_to_be32(args->lock_seqid->sequence->counter);
1076 } 1053 }
1077 hdr->nops++; 1054 hdr->nops++;
1078 hdr->replen += decode_lock_maxsz; 1055 hdr->replen += decode_lock_maxsz;
@@ -1082,15 +1059,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar
1082{ 1059{
1083 __be32 *p; 1060 __be32 *p;
1084 1061
1085 RESERVE_SPACE(52); 1062 p = reserve_space(xdr, 52);
1086 WRITE32(OP_LOCKT); 1063 *p++ = cpu_to_be32(OP_LOCKT);
1087 WRITE32(nfs4_lock_type(args->fl, 0)); 1064 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1088 WRITE64(args->fl->fl_start); 1065 p = xdr_encode_hyper(p, args->fl->fl_start);
1089 WRITE64(nfs4_lock_length(args->fl)); 1066 p = xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1090 WRITE64(args->lock_owner.clientid); 1067 p = xdr_encode_hyper(p, args->lock_owner.clientid);
1091 WRITE32(16); 1068 *p++ = cpu_to_be32(16);
1092 WRITEMEM("lock id:", 8); 1069 p = xdr_encode_opaque_fixed(p, "lock id:", 8);
1093 WRITE64(args->lock_owner.id); 1070 xdr_encode_hyper(p, args->lock_owner.id);
1094 hdr->nops++; 1071 hdr->nops++;
1095 hdr->replen += decode_lockt_maxsz; 1072 hdr->replen += decode_lockt_maxsz;
1096} 1073}
@@ -1099,13 +1076,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
1099{ 1076{
1100 __be32 *p; 1077 __be32 *p;
1101 1078
1102 RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); 1079 p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16);
1103 WRITE32(OP_LOCKU); 1080 *p++ = cpu_to_be32(OP_LOCKU);
1104 WRITE32(nfs4_lock_type(args->fl, 0)); 1081 *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0));
1105 WRITE32(args->seqid->sequence->counter); 1082 *p++ = cpu_to_be32(args->seqid->sequence->counter);
1106 WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); 1083 p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE);
1107 WRITE64(args->fl->fl_start); 1084 p = xdr_encode_hyper(p, args->fl->fl_start);
1108 WRITE64(nfs4_lock_length(args->fl)); 1085 xdr_encode_hyper(p, nfs4_lock_length(args->fl));
1109 hdr->nops++; 1086 hdr->nops++;
1110 hdr->replen += decode_locku_maxsz; 1087 hdr->replen += decode_locku_maxsz;
1111} 1088}
@@ -1115,10 +1092,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
1115 int len = name->len; 1092 int len = name->len;
1116 __be32 *p; 1093 __be32 *p;
1117 1094
1118 RESERVE_SPACE(8 + len); 1095 p = reserve_space(xdr, 8 + len);
1119 WRITE32(OP_LOOKUP); 1096 *p++ = cpu_to_be32(OP_LOOKUP);
1120 WRITE32(len); 1097 xdr_encode_opaque(p, name->name, len);
1121 WRITEMEM(name->name, len);
1122 hdr->nops++; 1098 hdr->nops++;
1123 hdr->replen += decode_lookup_maxsz; 1099 hdr->replen += decode_lookup_maxsz;
1124} 1100}
@@ -1127,21 +1103,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
1127{ 1103{
1128 __be32 *p; 1104 __be32 *p;
1129 1105
1130 RESERVE_SPACE(8); 1106 p = reserve_space(xdr, 8);
1131 switch (fmode & (FMODE_READ|FMODE_WRITE)) { 1107 switch (fmode & (FMODE_READ|FMODE_WRITE)) {
1132 case FMODE_READ: 1108 case FMODE_READ:
1133 WRITE32(NFS4_SHARE_ACCESS_READ); 1109 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
1134 break; 1110 break;
1135 case FMODE_WRITE: 1111 case FMODE_WRITE:
1136 WRITE32(NFS4_SHARE_ACCESS_WRITE); 1112 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
1137 break; 1113 break;
1138 case FMODE_READ|FMODE_WRITE: 1114 case FMODE_READ|FMODE_WRITE:
1139 WRITE32(NFS4_SHARE_ACCESS_BOTH); 1115 *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
1140 break; 1116 break;
1141 default: 1117 default:
1142 WRITE32(0); 1118 *p++ = cpu_to_be32(0);
1143 } 1119 }
1144 WRITE32(0); /* for linux, share_deny = 0 always */ 1120 *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
1145} 1121}
1146 1122
1147static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1123static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
@@ -1151,29 +1127,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
1151 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, 1127 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
1152 * owner 4 = 32 1128 * owner 4 = 32
1153 */ 1129 */
1154 RESERVE_SPACE(8); 1130 p = reserve_space(xdr, 8);
1155 WRITE32(OP_OPEN); 1131 *p++ = cpu_to_be32(OP_OPEN);
1156 WRITE32(arg->seqid->sequence->counter); 1132 *p = cpu_to_be32(arg->seqid->sequence->counter);
1157 encode_share_access(xdr, arg->fmode); 1133 encode_share_access(xdr, arg->fmode);
1158 RESERVE_SPACE(28); 1134 p = reserve_space(xdr, 28);
1159 WRITE64(arg->clientid); 1135 p = xdr_encode_hyper(p, arg->clientid);
1160 WRITE32(16); 1136 *p++ = cpu_to_be32(16);
1161 WRITEMEM("open id:", 8); 1137 p = xdr_encode_opaque_fixed(p, "open id:", 8);
1162 WRITE64(arg->id); 1138 xdr_encode_hyper(p, arg->id);
1163} 1139}
1164 1140
1165static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) 1141static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
1166{ 1142{
1167 __be32 *p; 1143 __be32 *p;
1168 1144
1169 RESERVE_SPACE(4); 1145 p = reserve_space(xdr, 4);
1170 switch(arg->open_flags & O_EXCL) { 1146 switch(arg->open_flags & O_EXCL) {
1171 case 0: 1147 case 0:
1172 WRITE32(NFS4_CREATE_UNCHECKED); 1148 *p = cpu_to_be32(NFS4_CREATE_UNCHECKED);
1173 encode_attrs(xdr, arg->u.attrs, arg->server); 1149 encode_attrs(xdr, arg->u.attrs, arg->server);
1174 break; 1150 break;
1175 default: 1151 default:
1176 WRITE32(NFS4_CREATE_EXCLUSIVE); 1152 *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE);
1177 encode_nfs4_verifier(xdr, &arg->u.verifier); 1153 encode_nfs4_verifier(xdr, &arg->u.verifier);
1178 } 1154 }
1179} 1155}
@@ -1182,14 +1158,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
1182{ 1158{
1183 __be32 *p; 1159 __be32 *p;
1184 1160
1185 RESERVE_SPACE(4); 1161 p = reserve_space(xdr, 4);
1186 switch (arg->open_flags & O_CREAT) { 1162 switch (arg->open_flags & O_CREAT) {
1187 case 0: 1163 case 0:
1188 WRITE32(NFS4_OPEN_NOCREATE); 1164 *p = cpu_to_be32(NFS4_OPEN_NOCREATE);
1189 break; 1165 break;
1190 default: 1166 default:
1191 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); 1167 BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
1192 WRITE32(NFS4_OPEN_CREATE); 1168 *p = cpu_to_be32(NFS4_OPEN_CREATE);
1193 encode_createmode(xdr, arg); 1169 encode_createmode(xdr, arg);
1194 } 1170 }
1195} 1171}
@@ -1198,16 +1174,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega
1198{ 1174{
1199 __be32 *p; 1175 __be32 *p;
1200 1176
1201 RESERVE_SPACE(4); 1177 p = reserve_space(xdr, 4);
1202 switch (delegation_type) { 1178 switch (delegation_type) {
1203 case 0: 1179 case 0:
1204 WRITE32(NFS4_OPEN_DELEGATE_NONE); 1180 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE);
1205 break; 1181 break;
1206 case FMODE_READ: 1182 case FMODE_READ:
1207 WRITE32(NFS4_OPEN_DELEGATE_READ); 1183 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ);
1208 break; 1184 break;
1209 case FMODE_WRITE|FMODE_READ: 1185 case FMODE_WRITE|FMODE_READ:
1210 WRITE32(NFS4_OPEN_DELEGATE_WRITE); 1186 *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE);
1211 break; 1187 break;
1212 default: 1188 default:
1213 BUG(); 1189 BUG();
@@ -1218,8 +1194,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
1218{ 1194{
1219 __be32 *p; 1195 __be32 *p;
1220 1196
1221 RESERVE_SPACE(4); 1197 p = reserve_space(xdr, 4);
1222 WRITE32(NFS4_OPEN_CLAIM_NULL); 1198 *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL);
1223 encode_string(xdr, name->len, name->name); 1199 encode_string(xdr, name->len, name->name);
1224} 1200}
1225 1201
@@ -1227,8 +1203,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
1227{ 1203{
1228 __be32 *p; 1204 __be32 *p;
1229 1205
1230 RESERVE_SPACE(4); 1206 p = reserve_space(xdr, 4);
1231 WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); 1207 *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS);
1232 encode_delegation_type(xdr, type); 1208 encode_delegation_type(xdr, type);
1233} 1209}
1234 1210
@@ -1236,9 +1212,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
1236{ 1212{
1237 __be32 *p; 1213 __be32 *p;
1238 1214
1239 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1215 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1240 WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); 1216 *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR);
1241 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1217 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1242 encode_string(xdr, name->len, name->name); 1218 encode_string(xdr, name->len, name->name);
1243} 1219}
1244 1220
@@ -1267,10 +1243,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
1267{ 1243{
1268 __be32 *p; 1244 __be32 *p;
1269 1245
1270 RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); 1246 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1271 WRITE32(OP_OPEN_CONFIRM); 1247 *p++ = cpu_to_be32(OP_OPEN_CONFIRM);
1272 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1248 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1273 WRITE32(arg->seqid->sequence->counter); 1249 *p = cpu_to_be32(arg->seqid->sequence->counter);
1274 hdr->nops++; 1250 hdr->nops++;
1275 hdr->replen += decode_open_confirm_maxsz; 1251 hdr->replen += decode_open_confirm_maxsz;
1276} 1252}
@@ -1279,10 +1255,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close
1279{ 1255{
1280 __be32 *p; 1256 __be32 *p;
1281 1257
1282 RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); 1258 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4);
1283 WRITE32(OP_OPEN_DOWNGRADE); 1259 *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE);
1284 WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); 1260 p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE);
1285 WRITE32(arg->seqid->sequence->counter); 1261 *p = cpu_to_be32(arg->seqid->sequence->counter);
1286 encode_share_access(xdr, arg->fmode); 1262 encode_share_access(xdr, arg->fmode);
1287 hdr->nops++; 1263 hdr->nops++;
1288 hdr->replen += decode_open_downgrade_maxsz; 1264 hdr->replen += decode_open_downgrade_maxsz;
@@ -1294,10 +1270,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd
1294 int len = fh->size; 1270 int len = fh->size;
1295 __be32 *p; 1271 __be32 *p;
1296 1272
1297 RESERVE_SPACE(8 + len); 1273 p = reserve_space(xdr, 8 + len);
1298 WRITE32(OP_PUTFH); 1274 *p++ = cpu_to_be32(OP_PUTFH);
1299 WRITE32(len); 1275 xdr_encode_opaque(p, fh->data, len);
1300 WRITEMEM(fh->data, len);
1301 hdr->nops++; 1276 hdr->nops++;
1302 hdr->replen += decode_putfh_maxsz; 1277 hdr->replen += decode_putfh_maxsz;
1303} 1278}
@@ -1306,8 +1281,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1306{ 1281{
1307 __be32 *p; 1282 __be32 *p;
1308 1283
1309 RESERVE_SPACE(4); 1284 p = reserve_space(xdr, 4);
1310 WRITE32(OP_PUTROOTFH); 1285 *p = cpu_to_be32(OP_PUTROOTFH);
1311 hdr->nops++; 1286 hdr->nops++;
1312 hdr->replen += decode_putrootfh_maxsz; 1287 hdr->replen += decode_putrootfh_maxsz;
1313} 1288}
@@ -1317,26 +1292,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1317 nfs4_stateid stateid; 1292 nfs4_stateid stateid;
1318 __be32 *p; 1293 __be32 *p;
1319 1294
1320 RESERVE_SPACE(NFS4_STATEID_SIZE); 1295 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1321 if (ctx->state != NULL) { 1296 if (ctx->state != NULL) {
1322 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); 1297 nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner);
1323 WRITEMEM(stateid.data, NFS4_STATEID_SIZE); 1298 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1324 } else 1299 } else
1325 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1300 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1326} 1301}
1327 1302
1328static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) 1303static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
1329{ 1304{
1330 __be32 *p; 1305 __be32 *p;
1331 1306
1332 RESERVE_SPACE(4); 1307 p = reserve_space(xdr, 4);
1333 WRITE32(OP_READ); 1308 *p = cpu_to_be32(OP_READ);
1334 1309
1335 encode_stateid(xdr, args->context); 1310 encode_stateid(xdr, args->context);
1336 1311
1337 RESERVE_SPACE(12); 1312 p = reserve_space(xdr, 12);
1338 WRITE64(args->offset); 1313 p = xdr_encode_hyper(p, args->offset);
1339 WRITE32(args->count); 1314 *p = cpu_to_be32(args->count);
1340 hdr->nops++; 1315 hdr->nops++;
1341 hdr->replen += decode_read_maxsz; 1316 hdr->replen += decode_read_maxsz;
1342} 1317}
@@ -1349,20 +1324,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
1349 }; 1324 };
1350 __be32 *p; 1325 __be32 *p;
1351 1326
1352 RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); 1327 p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20);
1353 WRITE32(OP_READDIR); 1328 *p++ = cpu_to_be32(OP_READDIR);
1354 WRITE64(readdir->cookie); 1329 p = xdr_encode_hyper(p, readdir->cookie);
1355 WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); 1330 p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE);
1356 WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ 1331 *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */
1357 WRITE32(readdir->count); 1332 *p++ = cpu_to_be32(readdir->count);
1358 WRITE32(2); 1333 *p++ = cpu_to_be32(2);
1359 /* Switch to mounted_on_fileid if the server supports it */ 1334 /* Switch to mounted_on_fileid if the server supports it */
1360 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) 1335 if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)
1361 attrs[0] &= ~FATTR4_WORD0_FILEID; 1336 attrs[0] &= ~FATTR4_WORD0_FILEID;
1362 else 1337 else
1363 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 1338 attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
1364 WRITE32(attrs[0] & readdir->bitmask[0]); 1339 *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
1365 WRITE32(attrs[1] & readdir->bitmask[1]); 1340 *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
1366 hdr->nops++; 1341 hdr->nops++;
1367 hdr->replen += decode_readdir_maxsz; 1342 hdr->replen += decode_readdir_maxsz;
1368 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", 1343 dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
@@ -1378,8 +1353,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *
1378{ 1353{
1379 __be32 *p; 1354 __be32 *p;
1380 1355
1381 RESERVE_SPACE(4); 1356 p = reserve_space(xdr, 4);
1382 WRITE32(OP_READLINK); 1357 *p = cpu_to_be32(OP_READLINK);
1383 hdr->nops++; 1358 hdr->nops++;
1384 hdr->replen += decode_readlink_maxsz; 1359 hdr->replen += decode_readlink_maxsz;
1385} 1360}
@@ -1388,10 +1363,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc
1388{ 1363{
1389 __be32 *p; 1364 __be32 *p;
1390 1365
1391 RESERVE_SPACE(8 + name->len); 1366 p = reserve_space(xdr, 8 + name->len);
1392 WRITE32(OP_REMOVE); 1367 *p++ = cpu_to_be32(OP_REMOVE);
1393 WRITE32(name->len); 1368 xdr_encode_opaque(p, name->name, name->len);
1394 WRITEMEM(name->name, name->len);
1395 hdr->nops++; 1369 hdr->nops++;
1396 hdr->replen += decode_remove_maxsz; 1370 hdr->replen += decode_remove_maxsz;
1397} 1371}
@@ -1400,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co
1400{ 1374{
1401 __be32 *p; 1375 __be32 *p;
1402 1376
1403 RESERVE_SPACE(8 + oldname->len); 1377 p = reserve_space(xdr, 4);
1404 WRITE32(OP_RENAME); 1378 *p = cpu_to_be32(OP_RENAME);
1405 WRITE32(oldname->len); 1379 encode_string(xdr, oldname->len, oldname->name);
1406 WRITEMEM(oldname->name, oldname->len); 1380 encode_string(xdr, newname->len, newname->name);
1407
1408 RESERVE_SPACE(4 + newname->len);
1409 WRITE32(newname->len);
1410 WRITEMEM(newname->name, newname->len);
1411 hdr->nops++; 1381 hdr->nops++;
1412 hdr->replen += decode_rename_maxsz; 1382 hdr->replen += decode_rename_maxsz;
1413} 1383}
@@ -1416,9 +1386,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client
1416{ 1386{
1417 __be32 *p; 1387 __be32 *p;
1418 1388
1419 RESERVE_SPACE(12); 1389 p = reserve_space(xdr, 12);
1420 WRITE32(OP_RENEW); 1390 *p++ = cpu_to_be32(OP_RENEW);
1421 WRITE64(client_stateid->cl_clientid); 1391 xdr_encode_hyper(p, client_stateid->cl_clientid);
1422 hdr->nops++; 1392 hdr->nops++;
1423 hdr->replen += decode_renew_maxsz; 1393 hdr->replen += decode_renew_maxsz;
1424} 1394}
@@ -1428,8 +1398,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1428{ 1398{
1429 __be32 *p; 1399 __be32 *p;
1430 1400
1431 RESERVE_SPACE(4); 1401 p = reserve_space(xdr, 4);
1432 WRITE32(OP_RESTOREFH); 1402 *p = cpu_to_be32(OP_RESTOREFH);
1433 hdr->nops++; 1403 hdr->nops++;
1434 hdr->replen += decode_restorefh_maxsz; 1404 hdr->replen += decode_restorefh_maxsz;
1435} 1405}
@@ -1439,16 +1409,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
1439{ 1409{
1440 __be32 *p; 1410 __be32 *p;
1441 1411
1442 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1412 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1443 WRITE32(OP_SETATTR); 1413 *p++ = cpu_to_be32(OP_SETATTR);
1444 WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); 1414 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
1445 RESERVE_SPACE(2*4); 1415 p = reserve_space(xdr, 2*4);
1446 WRITE32(1); 1416 *p++ = cpu_to_be32(1);
1447 WRITE32(FATTR4_WORD0_ACL); 1417 *p = cpu_to_be32(FATTR4_WORD0_ACL);
1448 if (arg->acl_len % 4) 1418 if (arg->acl_len % 4)
1449 return -EINVAL; 1419 return -EINVAL;
1450 RESERVE_SPACE(4); 1420 p = reserve_space(xdr, 4);
1451 WRITE32(arg->acl_len); 1421 *p = cpu_to_be32(arg->acl_len);
1452 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); 1422 xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
1453 hdr->nops++; 1423 hdr->nops++;
1454 hdr->replen += decode_setacl_maxsz; 1424 hdr->replen += decode_setacl_maxsz;
@@ -1460,8 +1430,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1460{ 1430{
1461 __be32 *p; 1431 __be32 *p;
1462 1432
1463 RESERVE_SPACE(4); 1433 p = reserve_space(xdr, 4);
1464 WRITE32(OP_SAVEFH); 1434 *p = cpu_to_be32(OP_SAVEFH);
1465 hdr->nops++; 1435 hdr->nops++;
1466 hdr->replen += decode_savefh_maxsz; 1436 hdr->replen += decode_savefh_maxsz;
1467} 1437}
@@ -1470,9 +1440,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs
1470{ 1440{
1471 __be32 *p; 1441 __be32 *p;
1472 1442
1473 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1443 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1474 WRITE32(OP_SETATTR); 1444 *p++ = cpu_to_be32(OP_SETATTR);
1475 WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); 1445 xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE);
1476 hdr->nops++; 1446 hdr->nops++;
1477 hdr->replen += decode_setattr_maxsz; 1447 hdr->replen += decode_setattr_maxsz;
1478 encode_attrs(xdr, arg->iap, server); 1448 encode_attrs(xdr, arg->iap, server);
@@ -1482,17 +1452,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie
1482{ 1452{
1483 __be32 *p; 1453 __be32 *p;
1484 1454
1485 RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); 1455 p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE);
1486 WRITE32(OP_SETCLIENTID); 1456 *p++ = cpu_to_be32(OP_SETCLIENTID);
1487 WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); 1457 xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE);
1488 1458
1489 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); 1459 encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name);
1490 RESERVE_SPACE(4); 1460 p = reserve_space(xdr, 4);
1491 WRITE32(setclientid->sc_prog); 1461 *p = cpu_to_be32(setclientid->sc_prog);
1492 encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); 1462 encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid);
1493 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); 1463 encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
1494 RESERVE_SPACE(4); 1464 p = reserve_space(xdr, 4);
1495 WRITE32(setclientid->sc_cb_ident); 1465 *p = cpu_to_be32(setclientid->sc_cb_ident);
1496 hdr->nops++; 1466 hdr->nops++;
1497 hdr->replen += decode_setclientid_maxsz; 1467 hdr->replen += decode_setclientid_maxsz;
1498} 1468}
@@ -1501,10 +1471,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_
1501{ 1471{
1502 __be32 *p; 1472 __be32 *p;
1503 1473
1504 RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); 1474 p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE);
1505 WRITE32(OP_SETCLIENTID_CONFIRM); 1475 *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM);
1506 WRITE64(client_state->cl_clientid); 1476 p = xdr_encode_hyper(p, client_state->cl_clientid);
1507 WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); 1477 xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
1508 hdr->nops++; 1478 hdr->nops++;
1509 hdr->replen += decode_setclientid_confirm_maxsz; 1479 hdr->replen += decode_setclientid_confirm_maxsz;
1510} 1480}
@@ -1513,15 +1483,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1513{ 1483{
1514 __be32 *p; 1484 __be32 *p;
1515 1485
1516 RESERVE_SPACE(4); 1486 p = reserve_space(xdr, 4);
1517 WRITE32(OP_WRITE); 1487 *p = cpu_to_be32(OP_WRITE);
1518 1488
1519 encode_stateid(xdr, args->context); 1489 encode_stateid(xdr, args->context);
1520 1490
1521 RESERVE_SPACE(16); 1491 p = reserve_space(xdr, 16);
1522 WRITE64(args->offset); 1492 p = xdr_encode_hyper(p, args->offset);
1523 WRITE32(args->stable); 1493 *p++ = cpu_to_be32(args->stable);
1524 WRITE32(args->count); 1494 *p = cpu_to_be32(args->count);
1525 1495
1526 xdr_write_pages(xdr, args->pages, args->pgbase, args->count); 1496 xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
1527 hdr->nops++; 1497 hdr->nops++;
@@ -1532,10 +1502,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1532{ 1502{
1533 __be32 *p; 1503 __be32 *p;
1534 1504
1535 RESERVE_SPACE(4+NFS4_STATEID_SIZE); 1505 p = reserve_space(xdr, 4+NFS4_STATEID_SIZE);
1536 1506
1537 WRITE32(OP_DELEGRETURN); 1507 *p++ = cpu_to_be32(OP_DELEGRETURN);
1538 WRITEMEM(stateid->data, NFS4_STATEID_SIZE); 1508 xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE);
1539 hdr->nops++; 1509 hdr->nops++;
1540 hdr->replen += decode_delegreturn_maxsz; 1510 hdr->replen += decode_delegreturn_maxsz;
1541} 1511}
@@ -1548,16 +1518,16 @@ static void encode_exchange_id(struct xdr_stream *xdr,
1548{ 1518{
1549 __be32 *p; 1519 __be32 *p;
1550 1520
1551 RESERVE_SPACE(4 + sizeof(args->verifier->data)); 1521 p = reserve_space(xdr, 4 + sizeof(args->verifier->data));
1552 WRITE32(OP_EXCHANGE_ID); 1522 *p++ = cpu_to_be32(OP_EXCHANGE_ID);
1553 WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); 1523 xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data));
1554 1524
1555 encode_string(xdr, args->id_len, args->id); 1525 encode_string(xdr, args->id_len, args->id);
1556 1526
1557 RESERVE_SPACE(12); 1527 p = reserve_space(xdr, 12);
1558 WRITE32(args->flags); 1528 *p++ = cpu_to_be32(args->flags);
1559 WRITE32(0); /* zero length state_protect4_a */ 1529 *p++ = cpu_to_be32(0); /* zero length state_protect4_a */
1560 WRITE32(0); /* zero length implementation id array */ 1530 *p = cpu_to_be32(0); /* zero length implementation id array */
1561 hdr->nops++; 1531 hdr->nops++;
1562 hdr->replen += decode_exchange_id_maxsz; 1532 hdr->replen += decode_exchange_id_maxsz;
1563} 1533}
@@ -1571,55 +1541,43 @@ static void encode_create_session(struct xdr_stream *xdr,
1571 uint32_t len; 1541 uint32_t len;
1572 struct nfs_client *clp = args->client; 1542 struct nfs_client *clp = args->client;
1573 1543
1574 RESERVE_SPACE(4); 1544 len = scnprintf(machine_name, sizeof(machine_name), "%s",
1575 WRITE32(OP_CREATE_SESSION); 1545 clp->cl_ipaddr);
1576
1577 RESERVE_SPACE(8);
1578 WRITE64(clp->cl_ex_clid);
1579 1546
1580 RESERVE_SPACE(8); 1547 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1581 WRITE32(clp->cl_seqid); /*Sequence id */ 1548 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1582 WRITE32(args->flags); /*flags */ 1549 p = xdr_encode_hyper(p, clp->cl_ex_clid);
1550 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1551 *p++ = cpu_to_be32(args->flags); /*flags */
1583 1552
1584 RESERVE_SPACE(2*28); /* 2 channel_attrs */
1585 /* Fore Channel */ 1553 /* Fore Channel */
1586 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ 1554 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1587 WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ 1555 *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */
1588 WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ 1556 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */
1589 WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1557 *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1590 WRITE32(args->fc_attrs.max_ops); /* max operations */ 1558 *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */
1591 WRITE32(args->fc_attrs.max_reqs); /* max requests */ 1559 *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */
1592 WRITE32(0); /* rdmachannel_attrs */ 1560 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
1593 1561
1594 /* Back Channel */ 1562 /* Back Channel */
1595 WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ 1563 *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */
1596 WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ 1564 *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */
1597 WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ 1565 *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */
1598 WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ 1566 *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */
1599 WRITE32(args->bc_attrs.max_ops); /* max operations */ 1567 *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */
1600 WRITE32(args->bc_attrs.max_reqs); /* max requests */ 1568 *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */
1601 WRITE32(0); /* rdmachannel_attrs */ 1569 *p++ = cpu_to_be32(0); /* rdmachannel_attrs */
1602 1570
1603 RESERVE_SPACE(4); 1571 *p++ = cpu_to_be32(args->cb_program); /* cb_program */
1604 WRITE32(args->cb_program); /* cb_program */ 1572 *p++ = cpu_to_be32(1);
1605 1573 *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */
1606 RESERVE_SPACE(4); /* # of security flavors */
1607 WRITE32(1);
1608
1609 RESERVE_SPACE(4);
1610 WRITE32(RPC_AUTH_UNIX); /* auth_sys */
1611 1574
1612 /* authsys_parms rfc1831 */ 1575 /* authsys_parms rfc1831 */
1613 RESERVE_SPACE(4); 1576 *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */
1614 WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ 1577 p = xdr_encode_opaque(p, machine_name, len);
1615 len = scnprintf(machine_name, sizeof(machine_name), "%s", 1578 *p++ = cpu_to_be32(0); /* UID */
1616 clp->cl_ipaddr); 1579 *p++ = cpu_to_be32(0); /* GID */
1617 RESERVE_SPACE(16 + len); 1580 *p = cpu_to_be32(0); /* No more gids */
1618 WRITE32(len);
1619 WRITEMEM(machine_name, len);
1620 WRITE32(0); /* UID */
1621 WRITE32(0); /* GID */
1622 WRITE32(0); /* No more gids */
1623 hdr->nops++; 1581 hdr->nops++;
1624 hdr->replen += decode_create_session_maxsz; 1582 hdr->replen += decode_create_session_maxsz;
1625} 1583}
@@ -1629,9 +1587,9 @@ static void encode_destroy_session(struct xdr_stream *xdr,
1629 struct compound_hdr *hdr) 1587 struct compound_hdr *hdr)
1630{ 1588{
1631 __be32 *p; 1589 __be32 *p;
1632 RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); 1590 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN);
1633 WRITE32(OP_DESTROY_SESSION); 1591 *p++ = cpu_to_be32(OP_DESTROY_SESSION);
1634 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1592 xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1635 hdr->nops++; 1593 hdr->nops++;
1636 hdr->replen += decode_destroy_session_maxsz; 1594 hdr->replen += decode_destroy_session_maxsz;
1637} 1595}
@@ -1655,8 +1613,8 @@ static void encode_sequence(struct xdr_stream *xdr,
1655 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); 1613 WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE);
1656 slot = tp->slots + args->sa_slotid; 1614 slot = tp->slots + args->sa_slotid;
1657 1615
1658 RESERVE_SPACE(4); 1616 p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16);
1659 WRITE32(OP_SEQUENCE); 1617 *p++ = cpu_to_be32(OP_SEQUENCE);
1660 1618
1661 /* 1619 /*
1662 * Sessionid + seqid + slotid + max slotid + cache_this 1620 * Sessionid + seqid + slotid + max slotid + cache_this
@@ -1670,12 +1628,11 @@ static void encode_sequence(struct xdr_stream *xdr,
1670 ((u32 *)session->sess_id.data)[3], 1628 ((u32 *)session->sess_id.data)[3],
1671 slot->seq_nr, args->sa_slotid, 1629 slot->seq_nr, args->sa_slotid,
1672 tp->highest_used_slotid, args->sa_cache_this); 1630 tp->highest_used_slotid, args->sa_cache_this);
1673 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); 1631 p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
1674 WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); 1632 *p++ = cpu_to_be32(slot->seq_nr);
1675 WRITE32(slot->seq_nr); 1633 *p++ = cpu_to_be32(args->sa_slotid);
1676 WRITE32(args->sa_slotid); 1634 *p++ = cpu_to_be32(tp->highest_used_slotid);
1677 WRITE32(tp->highest_used_slotid); 1635 *p = cpu_to_be32(args->sa_cache_this);
1678 WRITE32(args->sa_cache_this);
1679 hdr->nops++; 1636 hdr->nops++;
1680 hdr->replen += decode_sequence_maxsz; 1637 hdr->replen += decode_sequence_maxsz;
1681#endif /* CONFIG_NFS_V4_1 */ 1638#endif /* CONFIG_NFS_V4_1 */
@@ -2466,68 +2423,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p,
2466} 2423}
2467#endif /* CONFIG_NFS_V4_1 */ 2424#endif /* CONFIG_NFS_V4_1 */
2468 2425
2469/* 2426static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
2470 * START OF "GENERIC" DECODE ROUTINES. 2427{
2471 * These may look a little ugly since they are imported from a "generic" 2428 dprintk("nfs: %s: prematurely hit end of receive buffer. "
2472 * set of XDR encode/decode routines which are intended to be shared by 2429 "Remaining buffer length is %tu words.\n",
2473 * all of our NFSv4 implementations (OpenBSD, MacOS X...). 2430 func, xdr->end - xdr->p);
2474 * 2431}
2475 * If the pain of reading these is too great, it should be a straightforward
2476 * task to translate them into Linux-specific versions which are more
2477 * consistent with the style used in NFSv2/v3...
2478 */
2479#define READ32(x) (x) = ntohl(*p++)
2480#define READ64(x) do { \
2481 (x) = (u64)ntohl(*p++) << 32; \
2482 (x) |= ntohl(*p++); \
2483} while (0)
2484#define READTIME(x) do { \
2485 p++; \
2486 (x.tv_sec) = ntohl(*p++); \
2487 (x.tv_nsec) = ntohl(*p++); \
2488} while (0)
2489#define COPYMEM(x,nbytes) do { \
2490 memcpy((x), p, nbytes); \
2491 p += XDR_QUADLEN(nbytes); \
2492} while (0)
2493
2494#define READ_BUF(nbytes) do { \
2495 p = xdr_inline_decode(xdr, nbytes); \
2496 if (unlikely(!p)) { \
2497 dprintk("nfs: %s: prematurely hit end of receive" \
2498 " buffer\n", __func__); \
2499 dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
2500 __func__, xdr->p, nbytes, xdr->end); \
2501 return -EIO; \
2502 } \
2503} while (0)
2504 2432
2505static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) 2433static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string)
2506{ 2434{
2507 __be32 *p; 2435 __be32 *p;
2508 2436
2509 READ_BUF(4); 2437 p = xdr_inline_decode(xdr, 4);
2510 READ32(*len); 2438 if (unlikely(!p))
2511 READ_BUF(*len); 2439 goto out_overflow;
2440 *len = be32_to_cpup(p);
2441 p = xdr_inline_decode(xdr, *len);
2442 if (unlikely(!p))
2443 goto out_overflow;
2512 *string = (char *)p; 2444 *string = (char *)p;
2513 return 0; 2445 return 0;
2446out_overflow:
2447 print_overflow_msg(__func__, xdr);
2448 return -EIO;
2514} 2449}
2515 2450
2516static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) 2451static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
2517{ 2452{
2518 __be32 *p; 2453 __be32 *p;
2519 2454
2520 READ_BUF(8); 2455 p = xdr_inline_decode(xdr, 8);
2521 READ32(hdr->status); 2456 if (unlikely(!p))
2522 READ32(hdr->taglen); 2457 goto out_overflow;
2458 hdr->status = be32_to_cpup(p++);
2459 hdr->taglen = be32_to_cpup(p);
2523 2460
2524 READ_BUF(hdr->taglen + 4); 2461 p = xdr_inline_decode(xdr, hdr->taglen + 4);
2462 if (unlikely(!p))
2463 goto out_overflow;
2525 hdr->tag = (char *)p; 2464 hdr->tag = (char *)p;
2526 p += XDR_QUADLEN(hdr->taglen); 2465 p += XDR_QUADLEN(hdr->taglen);
2527 READ32(hdr->nops); 2466 hdr->nops = be32_to_cpup(p);
2528 if (unlikely(hdr->nops < 1)) 2467 if (unlikely(hdr->nops < 1))
2529 return nfs4_stat_to_errno(hdr->status); 2468 return nfs4_stat_to_errno(hdr->status);
2530 return 0; 2469 return 0;
2470out_overflow:
2471 print_overflow_msg(__func__, xdr);
2472 return -EIO;
2531} 2473}
2532 2474
2533static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) 2475static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
@@ -2536,18 +2478,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
2536 uint32_t opnum; 2478 uint32_t opnum;
2537 int32_t nfserr; 2479 int32_t nfserr;
2538 2480
2539 READ_BUF(8); 2481 p = xdr_inline_decode(xdr, 8);
2540 READ32(opnum); 2482 if (unlikely(!p))
2483 goto out_overflow;
2484 opnum = be32_to_cpup(p++);
2541 if (opnum != expected) { 2485 if (opnum != expected) {
2542 dprintk("nfs: Server returned operation" 2486 dprintk("nfs: Server returned operation"
2543 " %d but we issued a request for %d\n", 2487 " %d but we issued a request for %d\n",
2544 opnum, expected); 2488 opnum, expected);
2545 return -EIO; 2489 return -EIO;
2546 } 2490 }
2547 READ32(nfserr); 2491 nfserr = be32_to_cpup(p);
2548 if (nfserr != NFS_OK) 2492 if (nfserr != NFS_OK)
2549 return nfs4_stat_to_errno(nfserr); 2493 return nfs4_stat_to_errno(nfserr);
2550 return 0; 2494 return 0;
2495out_overflow:
2496 print_overflow_msg(__func__, xdr);
2497 return -EIO;
2551} 2498}
2552 2499
2553/* Dummy routine */ 2500/* Dummy routine */
@@ -2557,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp)
2557 unsigned int strlen; 2504 unsigned int strlen;
2558 char *str; 2505 char *str;
2559 2506
2560 READ_BUF(12); 2507 p = xdr_inline_decode(xdr, 12);
2561 return decode_opaque_inline(xdr, &strlen, &str); 2508 if (likely(p))
2509 return decode_opaque_inline(xdr, &strlen, &str);
2510 print_overflow_msg(__func__, xdr);
2511 return -EIO;
2562} 2512}
2563 2513
2564static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) 2514static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
@@ -2566,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap)
2566 uint32_t bmlen; 2516 uint32_t bmlen;
2567 __be32 *p; 2517 __be32 *p;
2568 2518
2569 READ_BUF(4); 2519 p = xdr_inline_decode(xdr, 4);
2570 READ32(bmlen); 2520 if (unlikely(!p))
2521 goto out_overflow;
2522 bmlen = be32_to_cpup(p);
2571 2523
2572 bitmap[0] = bitmap[1] = 0; 2524 bitmap[0] = bitmap[1] = 0;
2573 READ_BUF((bmlen << 2)); 2525 p = xdr_inline_decode(xdr, (bmlen << 2));
2526 if (unlikely(!p))
2527 goto out_overflow;
2574 if (bmlen > 0) { 2528 if (bmlen > 0) {
2575 READ32(bitmap[0]); 2529 bitmap[0] = be32_to_cpup(p++);
2576 if (bmlen > 1) 2530 if (bmlen > 1)
2577 READ32(bitmap[1]); 2531 bitmap[1] = be32_to_cpup(p);
2578 } 2532 }
2579 return 0; 2533 return 0;
2534out_overflow:
2535 print_overflow_msg(__func__, xdr);
2536 return -EIO;
2580} 2537}
2581 2538
2582static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) 2539static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep)
2583{ 2540{
2584 __be32 *p; 2541 __be32 *p;
2585 2542
2586 READ_BUF(4); 2543 p = xdr_inline_decode(xdr, 4);
2587 READ32(*attrlen); 2544 if (unlikely(!p))
2545 goto out_overflow;
2546 *attrlen = be32_to_cpup(p);
2588 *savep = xdr->p; 2547 *savep = xdr->p;
2589 return 0; 2548 return 0;
2549out_overflow:
2550 print_overflow_msg(__func__, xdr);
2551 return -EIO;
2590} 2552}
2591 2553
2592static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) 2554static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask)
@@ -2609,8 +2571,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2609 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) 2571 if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
2610 return -EIO; 2572 return -EIO;
2611 if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { 2573 if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) {
2612 READ_BUF(4); 2574 p = xdr_inline_decode(xdr, 4);
2613 READ32(*type); 2575 if (unlikely(!p))
2576 goto out_overflow;
2577 *type = be32_to_cpup(p);
2614 if (*type < NF4REG || *type > NF4NAMEDATTR) { 2578 if (*type < NF4REG || *type > NF4NAMEDATTR) {
2615 dprintk("%s: bad type %d\n", __func__, *type); 2579 dprintk("%s: bad type %d\n", __func__, *type);
2616 return -EIO; 2580 return -EIO;
@@ -2620,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
2620 } 2584 }
2621 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); 2585 dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
2622 return ret; 2586 return ret;
2587out_overflow:
2588 print_overflow_msg(__func__, xdr);
2589 return -EIO;
2623} 2590}
2624 2591
2625static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) 2592static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
@@ -2631,14 +2598,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2631 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) 2598 if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
2632 return -EIO; 2599 return -EIO;
2633 if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { 2600 if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) {
2634 READ_BUF(8); 2601 p = xdr_inline_decode(xdr, 8);
2635 READ64(*change); 2602 if (unlikely(!p))
2603 goto out_overflow;
2604 xdr_decode_hyper(p, change);
2636 bitmap[0] &= ~FATTR4_WORD0_CHANGE; 2605 bitmap[0] &= ~FATTR4_WORD0_CHANGE;
2637 ret = NFS_ATTR_FATTR_CHANGE; 2606 ret = NFS_ATTR_FATTR_CHANGE;
2638 } 2607 }
2639 dprintk("%s: change attribute=%Lu\n", __func__, 2608 dprintk("%s: change attribute=%Lu\n", __func__,
2640 (unsigned long long)*change); 2609 (unsigned long long)*change);
2641 return ret; 2610 return ret;
2611out_overflow:
2612 print_overflow_msg(__func__, xdr);
2613 return -EIO;
2642} 2614}
2643 2615
2644static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) 2616static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
@@ -2650,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
2650 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) 2622 if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
2651 return -EIO; 2623 return -EIO;
2652 if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { 2624 if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) {
2653 READ_BUF(8); 2625 p = xdr_inline_decode(xdr, 8);
2654 READ64(*size); 2626 if (unlikely(!p))
2627 goto out_overflow;
2628 xdr_decode_hyper(p, size);
2655 bitmap[0] &= ~FATTR4_WORD0_SIZE; 2629 bitmap[0] &= ~FATTR4_WORD0_SIZE;
2656 ret = NFS_ATTR_FATTR_SIZE; 2630 ret = NFS_ATTR_FATTR_SIZE;
2657 } 2631 }
2658 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); 2632 dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
2659 return ret; 2633 return ret;
2634out_overflow:
2635 print_overflow_msg(__func__, xdr);
2636 return -EIO;
2660} 2637}
2661 2638
2662static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2639static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2667,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
2667 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) 2644 if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U)))
2668 return -EIO; 2645 return -EIO;
2669 if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { 2646 if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) {
2670 READ_BUF(4); 2647 p = xdr_inline_decode(xdr, 4);
2671 READ32(*res); 2648 if (unlikely(!p))
2649 goto out_overflow;
2650 *res = be32_to_cpup(p);
2672 bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; 2651 bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
2673 } 2652 }
2674 dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); 2653 dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
2675 return 0; 2654 return 0;
2655out_overflow:
2656 print_overflow_msg(__func__, xdr);
2657 return -EIO;
2676} 2658}
2677 2659
2678static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2660static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2683,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
2683 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) 2665 if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U)))
2684 return -EIO; 2666 return -EIO;
2685 if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { 2667 if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) {
2686 READ_BUF(4); 2668 p = xdr_inline_decode(xdr, 4);
2687 READ32(*res); 2669 if (unlikely(!p))
2670 goto out_overflow;
2671 *res = be32_to_cpup(p);
2688 bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; 2672 bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
2689 } 2673 }
2690 dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); 2674 dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
2691 return 0; 2675 return 0;
2676out_overflow:
2677 print_overflow_msg(__func__, xdr);
2678 return -EIO;
2692} 2679}
2693 2680
2694static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) 2681static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
@@ -2701,9 +2688,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2701 if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) 2688 if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U)))
2702 return -EIO; 2689 return -EIO;
2703 if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { 2690 if (likely(bitmap[0] & FATTR4_WORD0_FSID)) {
2704 READ_BUF(16); 2691 p = xdr_inline_decode(xdr, 16);
2705 READ64(fsid->major); 2692 if (unlikely(!p))
2706 READ64(fsid->minor); 2693 goto out_overflow;
2694 p = xdr_decode_hyper(p, &fsid->major);
2695 xdr_decode_hyper(p, &fsid->minor);
2707 bitmap[0] &= ~FATTR4_WORD0_FSID; 2696 bitmap[0] &= ~FATTR4_WORD0_FSID;
2708 ret = NFS_ATTR_FATTR_FSID; 2697 ret = NFS_ATTR_FATTR_FSID;
2709 } 2698 }
@@ -2711,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
2711 (unsigned long long)fsid->major, 2700 (unsigned long long)fsid->major,
2712 (unsigned long long)fsid->minor); 2701 (unsigned long long)fsid->minor);
2713 return ret; 2702 return ret;
2703out_overflow:
2704 print_overflow_msg(__func__, xdr);
2705 return -EIO;
2714} 2706}
2715 2707
2716static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2708static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2721,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
2721 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) 2713 if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U)))
2722 return -EIO; 2714 return -EIO;
2723 if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { 2715 if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) {
2724 READ_BUF(4); 2716 p = xdr_inline_decode(xdr, 4);
2725 READ32(*res); 2717 if (unlikely(!p))
2718 goto out_overflow;
2719 *res = be32_to_cpup(p);
2726 bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; 2720 bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
2727 } 2721 }
2728 dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); 2722 dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
2729 return 0; 2723 return 0;
2724out_overflow:
2725 print_overflow_msg(__func__, xdr);
2726 return -EIO;
2730} 2727}
2731 2728
2732static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 2729static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2737,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
2737 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) 2734 if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
2738 return -EIO; 2735 return -EIO;
2739 if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { 2736 if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
2740 READ_BUF(4); 2737 p = xdr_inline_decode(xdr, 4);
2741 READ32(*res); 2738 if (unlikely(!p))
2739 goto out_overflow;
2740 *res = be32_to_cpup(p);
2742 bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; 2741 bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
2743 } 2742 }
2744 dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); 2743 dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
2745 return 0; 2744 return 0;
2745out_overflow:
2746 print_overflow_msg(__func__, xdr);
2747 return -EIO;
2746} 2748}
2747 2749
2748static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2750static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2754,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
2754 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) 2756 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
2755 return -EIO; 2757 return -EIO;
2756 if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { 2758 if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) {
2757 READ_BUF(8); 2759 p = xdr_inline_decode(xdr, 8);
2758 READ64(*fileid); 2760 if (unlikely(!p))
2761 goto out_overflow;
2762 xdr_decode_hyper(p, fileid);
2759 bitmap[0] &= ~FATTR4_WORD0_FILEID; 2763 bitmap[0] &= ~FATTR4_WORD0_FILEID;
2760 ret = NFS_ATTR_FATTR_FILEID; 2764 ret = NFS_ATTR_FATTR_FILEID;
2761 } 2765 }
2762 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2766 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2763 return ret; 2767 return ret;
2768out_overflow:
2769 print_overflow_msg(__func__, xdr);
2770 return -EIO;
2764} 2771}
2765 2772
2766static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) 2773static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
@@ -2772,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
2772 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) 2779 if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
2773 return -EIO; 2780 return -EIO;
2774 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { 2781 if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) {
2775 READ_BUF(8); 2782 p = xdr_inline_decode(xdr, 8);
2776 READ64(*fileid); 2783 if (unlikely(!p))
2784 goto out_overflow;
2785 xdr_decode_hyper(p, fileid);
2777 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; 2786 bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
2778 ret = NFS_ATTR_FATTR_FILEID; 2787 ret = NFS_ATTR_FATTR_FILEID;
2779 } 2788 }
2780 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); 2789 dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
2781 return ret; 2790 return ret;
2791out_overflow:
2792 print_overflow_msg(__func__, xdr);
2793 return -EIO;
2782} 2794}
2783 2795
2784static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2796static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2790,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
2790 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) 2802 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U)))
2791 return -EIO; 2803 return -EIO;
2792 if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { 2804 if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) {
2793 READ_BUF(8); 2805 p = xdr_inline_decode(xdr, 8);
2794 READ64(*res); 2806 if (unlikely(!p))
2807 goto out_overflow;
2808 xdr_decode_hyper(p, res);
2795 bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; 2809 bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
2796 } 2810 }
2797 dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); 2811 dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
2798 return status; 2812 return status;
2813out_overflow:
2814 print_overflow_msg(__func__, xdr);
2815 return -EIO;
2799} 2816}
2800 2817
2801static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2818static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2807,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
2807 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) 2824 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U)))
2808 return -EIO; 2825 return -EIO;
2809 if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { 2826 if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) {
2810 READ_BUF(8); 2827 p = xdr_inline_decode(xdr, 8);
2811 READ64(*res); 2828 if (unlikely(!p))
2829 goto out_overflow;
2830 xdr_decode_hyper(p, res);
2812 bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; 2831 bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
2813 } 2832 }
2814 dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); 2833 dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
2815 return status; 2834 return status;
2835out_overflow:
2836 print_overflow_msg(__func__, xdr);
2837 return -EIO;
2816} 2838}
2817 2839
2818static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 2840static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2824,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
2824 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) 2846 if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U)))
2825 return -EIO; 2847 return -EIO;
2826 if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { 2848 if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) {
2827 READ_BUF(8); 2849 p = xdr_inline_decode(xdr, 8);
2828 READ64(*res); 2850 if (unlikely(!p))
2851 goto out_overflow;
2852 xdr_decode_hyper(p, res);
2829 bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; 2853 bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
2830 } 2854 }
2831 dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); 2855 dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
2832 return status; 2856 return status;
2857out_overflow:
2858 print_overflow_msg(__func__, xdr);
2859 return -EIO;
2833} 2860}
2834 2861
2835static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) 2862static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
@@ -2838,8 +2865,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path)
2838 __be32 *p; 2865 __be32 *p;
2839 int status = 0; 2866 int status = 0;
2840 2867
2841 READ_BUF(4); 2868 p = xdr_inline_decode(xdr, 4);
2842 READ32(n); 2869 if (unlikely(!p))
2870 goto out_overflow;
2871 n = be32_to_cpup(p);
2843 if (n == 0) 2872 if (n == 0)
2844 goto root_path; 2873 goto root_path;
2845 dprintk("path "); 2874 dprintk("path ");
@@ -2873,6 +2902,9 @@ out_eio:
2873 dprintk(" status %d", status); 2902 dprintk(" status %d", status);
2874 status = -EIO; 2903 status = -EIO;
2875 goto out; 2904 goto out;
2905out_overflow:
2906 print_overflow_msg(__func__, xdr);
2907 return -EIO;
2876} 2908}
2877 2909
2878static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) 2910static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res)
@@ -2890,8 +2922,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2890 status = decode_pathname(xdr, &res->fs_path); 2922 status = decode_pathname(xdr, &res->fs_path);
2891 if (unlikely(status != 0)) 2923 if (unlikely(status != 0))
2892 goto out; 2924 goto out;
2893 READ_BUF(4); 2925 p = xdr_inline_decode(xdr, 4);
2894 READ32(n); 2926 if (unlikely(!p))
2927 goto out_overflow;
2928 n = be32_to_cpup(p);
2895 if (n <= 0) 2929 if (n <= 0)
2896 goto out_eio; 2930 goto out_eio;
2897 res->nlocations = 0; 2931 res->nlocations = 0;
@@ -2899,8 +2933,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2899 u32 m; 2933 u32 m;
2900 struct nfs4_fs_location *loc = &res->locations[res->nlocations]; 2934 struct nfs4_fs_location *loc = &res->locations[res->nlocations];
2901 2935
2902 READ_BUF(4); 2936 p = xdr_inline_decode(xdr, 4);
2903 READ32(m); 2937 if (unlikely(!p))
2938 goto out_overflow;
2939 m = be32_to_cpup(p);
2904 2940
2905 loc->nservers = 0; 2941 loc->nservers = 0;
2906 dprintk("%s: servers ", __func__); 2942 dprintk("%s: servers ", __func__);
@@ -2939,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
2939out: 2975out:
2940 dprintk("%s: fs_locations done, error = %d\n", __func__, status); 2976 dprintk("%s: fs_locations done, error = %d\n", __func__, status);
2941 return status; 2977 return status;
2978out_overflow:
2979 print_overflow_msg(__func__, xdr);
2942out_eio: 2980out_eio:
2943 status = -EIO; 2981 status = -EIO;
2944 goto out; 2982 goto out;
@@ -2953,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
2953 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) 2991 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U)))
2954 return -EIO; 2992 return -EIO;
2955 if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { 2993 if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) {
2956 READ_BUF(8); 2994 p = xdr_inline_decode(xdr, 8);
2957 READ64(*res); 2995 if (unlikely(!p))
2996 goto out_overflow;
2997 xdr_decode_hyper(p, res);
2958 bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; 2998 bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
2959 } 2999 }
2960 dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); 3000 dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
2961 return status; 3001 return status;
3002out_overflow:
3003 print_overflow_msg(__func__, xdr);
3004 return -EIO;
2962} 3005}
2963 3006
2964static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) 3007static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink)
@@ -2970,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
2970 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) 3013 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U)))
2971 return -EIO; 3014 return -EIO;
2972 if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { 3015 if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) {
2973 READ_BUF(4); 3016 p = xdr_inline_decode(xdr, 4);
2974 READ32(*maxlink); 3017 if (unlikely(!p))
3018 goto out_overflow;
3019 *maxlink = be32_to_cpup(p);
2975 bitmap[0] &= ~FATTR4_WORD0_MAXLINK; 3020 bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
2976 } 3021 }
2977 dprintk("%s: maxlink=%u\n", __func__, *maxlink); 3022 dprintk("%s: maxlink=%u\n", __func__, *maxlink);
2978 return status; 3023 return status;
3024out_overflow:
3025 print_overflow_msg(__func__, xdr);
3026 return -EIO;
2979} 3027}
2980 3028
2981static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) 3029static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname)
@@ -2987,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
2987 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) 3035 if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U)))
2988 return -EIO; 3036 return -EIO;
2989 if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { 3037 if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) {
2990 READ_BUF(4); 3038 p = xdr_inline_decode(xdr, 4);
2991 READ32(*maxname); 3039 if (unlikely(!p))
3040 goto out_overflow;
3041 *maxname = be32_to_cpup(p);
2992 bitmap[0] &= ~FATTR4_WORD0_MAXNAME; 3042 bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
2993 } 3043 }
2994 dprintk("%s: maxname=%u\n", __func__, *maxname); 3044 dprintk("%s: maxname=%u\n", __func__, *maxname);
2995 return status; 3045 return status;
3046out_overflow:
3047 print_overflow_msg(__func__, xdr);
3048 return -EIO;
2996} 3049}
2997 3050
2998static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 3051static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3005,8 +3058,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
3005 return -EIO; 3058 return -EIO;
3006 if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { 3059 if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) {
3007 uint64_t maxread; 3060 uint64_t maxread;
3008 READ_BUF(8); 3061 p = xdr_inline_decode(xdr, 8);
3009 READ64(maxread); 3062 if (unlikely(!p))
3063 goto out_overflow;
3064 xdr_decode_hyper(p, &maxread);
3010 if (maxread > 0x7FFFFFFF) 3065 if (maxread > 0x7FFFFFFF)
3011 maxread = 0x7FFFFFFF; 3066 maxread = 0x7FFFFFFF;
3012 *res = (uint32_t)maxread; 3067 *res = (uint32_t)maxread;
@@ -3014,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
3014 } 3069 }
3015 dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); 3070 dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
3016 return status; 3071 return status;
3072out_overflow:
3073 print_overflow_msg(__func__, xdr);
3074 return -EIO;
3017} 3075}
3018 3076
3019static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) 3077static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -3026,8 +3084,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
3026 return -EIO; 3084 return -EIO;
3027 if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { 3085 if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) {
3028 uint64_t maxwrite; 3086 uint64_t maxwrite;
3029 READ_BUF(8); 3087 p = xdr_inline_decode(xdr, 8);
3030 READ64(maxwrite); 3088 if (unlikely(!p))
3089 goto out_overflow;
3090 xdr_decode_hyper(p, &maxwrite);
3031 if (maxwrite > 0x7FFFFFFF) 3091 if (maxwrite > 0x7FFFFFFF)
3032 maxwrite = 0x7FFFFFFF; 3092 maxwrite = 0x7FFFFFFF;
3033 *res = (uint32_t)maxwrite; 3093 *res = (uint32_t)maxwrite;
@@ -3035,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
3035 } 3095 }
3036 dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); 3096 dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
3037 return status; 3097 return status;
3098out_overflow:
3099 print_overflow_msg(__func__, xdr);
3100 return -EIO;
3038} 3101}
3039 3102
3040static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) 3103static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
@@ -3047,14 +3110,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m
3047 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) 3110 if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
3048 return -EIO; 3111 return -EIO;
3049 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { 3112 if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
3050 READ_BUF(4); 3113 p = xdr_inline_decode(xdr, 4);
3051 READ32(tmp); 3114 if (unlikely(!p))
3115 goto out_overflow;
3116 tmp = be32_to_cpup(p);
3052 *mode = tmp & ~S_IFMT; 3117 *mode = tmp & ~S_IFMT;
3053 bitmap[1] &= ~FATTR4_WORD1_MODE; 3118 bitmap[1] &= ~FATTR4_WORD1_MODE;
3054 ret = NFS_ATTR_FATTR_MODE; 3119 ret = NFS_ATTR_FATTR_MODE;
3055 } 3120 }
3056 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); 3121 dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
3057 return ret; 3122 return ret;
3123out_overflow:
3124 print_overflow_msg(__func__, xdr);
3125 return -EIO;
3058} 3126}
3059 3127
3060static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) 3128static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
@@ -3066,16 +3134,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
3066 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) 3134 if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
3067 return -EIO; 3135 return -EIO;
3068 if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { 3136 if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) {
3069 READ_BUF(4); 3137 p = xdr_inline_decode(xdr, 4);
3070 READ32(*nlink); 3138 if (unlikely(!p))
3139 goto out_overflow;
3140 *nlink = be32_to_cpup(p);
3071 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; 3141 bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
3072 ret = NFS_ATTR_FATTR_NLINK; 3142 ret = NFS_ATTR_FATTR_NLINK;
3073 } 3143 }
3074 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); 3144 dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
3075 return ret; 3145 return ret;
3146out_overflow:
3147 print_overflow_msg(__func__, xdr);
3148 return -EIO;
3076} 3149}
3077 3150
3078static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) 3151static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3152 struct nfs_client *clp, uint32_t *uid, int may_sleep)
3079{ 3153{
3080 uint32_t len; 3154 uint32_t len;
3081 __be32 *p; 3155 __be32 *p;
@@ -3085,10 +3159,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3085 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) 3159 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
3086 return -EIO; 3160 return -EIO;
3087 if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { 3161 if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) {
3088 READ_BUF(4); 3162 p = xdr_inline_decode(xdr, 4);
3089 READ32(len); 3163 if (unlikely(!p))
3090 READ_BUF(len); 3164 goto out_overflow;
3091 if (len < XDR_MAX_NETOBJ) { 3165 len = be32_to_cpup(p);
3166 p = xdr_inline_decode(xdr, len);
3167 if (unlikely(!p))
3168 goto out_overflow;
3169 if (!may_sleep) {
3170 /* do nothing */
3171 } else if (len < XDR_MAX_NETOBJ) {
3092 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3172 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
3093 ret = NFS_ATTR_FATTR_OWNER; 3173 ret = NFS_ATTR_FATTR_OWNER;
3094 else 3174 else
@@ -3101,9 +3181,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3101 } 3181 }
3102 dprintk("%s: uid=%d\n", __func__, (int)*uid); 3182 dprintk("%s: uid=%d\n", __func__, (int)*uid);
3103 return ret; 3183 return ret;
3184out_overflow:
3185 print_overflow_msg(__func__, xdr);
3186 return -EIO;
3104} 3187}
3105 3188
3106static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) 3189static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3190 struct nfs_client *clp, uint32_t *gid, int may_sleep)
3107{ 3191{
3108 uint32_t len; 3192 uint32_t len;
3109 __be32 *p; 3193 __be32 *p;
@@ -3113,10 +3197,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3113 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) 3197 if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
3114 return -EIO; 3198 return -EIO;
3115 if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { 3199 if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) {
3116 READ_BUF(4); 3200 p = xdr_inline_decode(xdr, 4);
3117 READ32(len); 3201 if (unlikely(!p))
3118 READ_BUF(len); 3202 goto out_overflow;
3119 if (len < XDR_MAX_NETOBJ) { 3203 len = be32_to_cpup(p);
3204 p = xdr_inline_decode(xdr, len);
3205 if (unlikely(!p))
3206 goto out_overflow;
3207 if (!may_sleep) {
3208 /* do nothing */
3209 } else if (len < XDR_MAX_NETOBJ) {
3120 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3210 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
3121 ret = NFS_ATTR_FATTR_GROUP; 3211 ret = NFS_ATTR_FATTR_GROUP;
3122 else 3212 else
@@ -3129,6 +3219,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
3129 } 3219 }
3130 dprintk("%s: gid=%d\n", __func__, (int)*gid); 3220 dprintk("%s: gid=%d\n", __func__, (int)*gid);
3131 return ret; 3221 return ret;
3222out_overflow:
3223 print_overflow_msg(__func__, xdr);
3224 return -EIO;
3132} 3225}
3133 3226
3134static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) 3227static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
@@ -3143,9 +3236,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
3143 if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { 3236 if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) {
3144 dev_t tmp; 3237 dev_t tmp;
3145 3238
3146 READ_BUF(8); 3239 p = xdr_inline_decode(xdr, 8);
3147 READ32(major); 3240 if (unlikely(!p))
3148 READ32(minor); 3241 goto out_overflow;
3242 major = be32_to_cpup(p++);
3243 minor = be32_to_cpup(p);
3149 tmp = MKDEV(major, minor); 3244 tmp = MKDEV(major, minor);
3150 if (MAJOR(tmp) == major && MINOR(tmp) == minor) 3245 if (MAJOR(tmp) == major && MINOR(tmp) == minor)
3151 *rdev = tmp; 3246 *rdev = tmp;
@@ -3154,6 +3249,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
3154 } 3249 }
3155 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); 3250 dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
3156 return ret; 3251 return ret;
3252out_overflow:
3253 print_overflow_msg(__func__, xdr);
3254 return -EIO;
3157} 3255}
3158 3256
3159static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 3257static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3165,12 +3263,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
3165 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) 3263 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U)))
3166 return -EIO; 3264 return -EIO;
3167 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { 3265 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) {
3168 READ_BUF(8); 3266 p = xdr_inline_decode(xdr, 8);
3169 READ64(*res); 3267 if (unlikely(!p))
3268 goto out_overflow;
3269 xdr_decode_hyper(p, res);
3170 bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; 3270 bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
3171 } 3271 }
3172 dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); 3272 dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
3173 return status; 3273 return status;
3274out_overflow:
3275 print_overflow_msg(__func__, xdr);
3276 return -EIO;
3174} 3277}
3175 3278
3176static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 3279static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3182,12 +3285,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
3182 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) 3285 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U)))
3183 return -EIO; 3286 return -EIO;
3184 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { 3287 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) {
3185 READ_BUF(8); 3288 p = xdr_inline_decode(xdr, 8);
3186 READ64(*res); 3289 if (unlikely(!p))
3290 goto out_overflow;
3291 xdr_decode_hyper(p, res);
3187 bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; 3292 bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
3188 } 3293 }
3189 dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); 3294 dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
3190 return status; 3295 return status;
3296out_overflow:
3297 print_overflow_msg(__func__, xdr);
3298 return -EIO;
3191} 3299}
3192 3300
3193static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) 3301static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -3199,12 +3307,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
3199 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) 3307 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U)))
3200 return -EIO; 3308 return -EIO;
3201 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { 3309 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) {
3202 READ_BUF(8); 3310 p = xdr_inline_decode(xdr, 8);
3203 READ64(*res); 3311 if (unlikely(!p))
3312 goto out_overflow;
3313 xdr_decode_hyper(p, res);
3204 bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; 3314 bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
3205 } 3315 }
3206 dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); 3316 dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
3207 return status; 3317 return status;
3318out_overflow:
3319 print_overflow_msg(__func__, xdr);
3320 return -EIO;
3208} 3321}
3209 3322
3210static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) 3323static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
@@ -3216,14 +3329,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
3216 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) 3329 if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
3217 return -EIO; 3330 return -EIO;
3218 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { 3331 if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) {
3219 READ_BUF(8); 3332 p = xdr_inline_decode(xdr, 8);
3220 READ64(*used); 3333 if (unlikely(!p))
3334 goto out_overflow;
3335 xdr_decode_hyper(p, used);
3221 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; 3336 bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
3222 ret = NFS_ATTR_FATTR_SPACE_USED; 3337 ret = NFS_ATTR_FATTR_SPACE_USED;
3223 } 3338 }
3224 dprintk("%s: space used=%Lu\n", __func__, 3339 dprintk("%s: space used=%Lu\n", __func__,
3225 (unsigned long long)*used); 3340 (unsigned long long)*used);
3226 return ret; 3341 return ret;
3342out_overflow:
3343 print_overflow_msg(__func__, xdr);
3344 return -EIO;
3227} 3345}
3228 3346
3229static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) 3347static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -3232,12 +3350,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
3232 uint64_t sec; 3350 uint64_t sec;
3233 uint32_t nsec; 3351 uint32_t nsec;
3234 3352
3235 READ_BUF(12); 3353 p = xdr_inline_decode(xdr, 12);
3236 READ64(sec); 3354 if (unlikely(!p))
3237 READ32(nsec); 3355 goto out_overflow;
3356 p = xdr_decode_hyper(p, &sec);
3357 nsec = be32_to_cpup(p);
3238 time->tv_sec = (time_t)sec; 3358 time->tv_sec = (time_t)sec;
3239 time->tv_nsec = (long)nsec; 3359 time->tv_nsec = (long)nsec;
3240 return 0; 3360 return 0;
3361out_overflow:
3362 print_overflow_msg(__func__, xdr);
3363 return -EIO;
3241} 3364}
3242 3365
3243static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) 3366static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time)
@@ -3315,11 +3438,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c
3315{ 3438{
3316 __be32 *p; 3439 __be32 *p;
3317 3440
3318 READ_BUF(20); 3441 p = xdr_inline_decode(xdr, 20);
3319 READ32(cinfo->atomic); 3442 if (unlikely(!p))
3320 READ64(cinfo->before); 3443 goto out_overflow;
3321 READ64(cinfo->after); 3444 cinfo->atomic = be32_to_cpup(p++);
3445 p = xdr_decode_hyper(p, &cinfo->before);
3446 xdr_decode_hyper(p, &cinfo->after);
3322 return 0; 3447 return 0;
3448out_overflow:
3449 print_overflow_msg(__func__, xdr);
3450 return -EIO;
3323} 3451}
3324 3452
3325static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) 3453static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
@@ -3331,40 +3459,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access)
3331 status = decode_op_hdr(xdr, OP_ACCESS); 3459 status = decode_op_hdr(xdr, OP_ACCESS);
3332 if (status) 3460 if (status)
3333 return status; 3461 return status;
3334 READ_BUF(8); 3462 p = xdr_inline_decode(xdr, 8);
3335 READ32(supp); 3463 if (unlikely(!p))
3336 READ32(acc); 3464 goto out_overflow;
3465 supp = be32_to_cpup(p++);
3466 acc = be32_to_cpup(p);
3337 access->supported = supp; 3467 access->supported = supp;
3338 access->access = acc; 3468 access->access = acc;
3339 return 0; 3469 return 0;
3470out_overflow:
3471 print_overflow_msg(__func__, xdr);
3472 return -EIO;
3340} 3473}
3341 3474
3342static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) 3475static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
3343{ 3476{
3344 __be32 *p; 3477 __be32 *p;
3478
3479 p = xdr_inline_decode(xdr, len);
3480 if (likely(p)) {
3481 memcpy(buf, p, len);
3482 return 0;
3483 }
3484 print_overflow_msg(__func__, xdr);
3485 return -EIO;
3486}
3487
3488static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
3489{
3490 return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE);
3491}
3492
3493static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res)
3494{
3345 int status; 3495 int status;
3346 3496
3347 status = decode_op_hdr(xdr, OP_CLOSE); 3497 status = decode_op_hdr(xdr, OP_CLOSE);
3348 if (status != -EIO) 3498 if (status != -EIO)
3349 nfs_increment_open_seqid(status, res->seqid); 3499 nfs_increment_open_seqid(status, res->seqid);
3350 if (status) 3500 if (!status)
3351 return status; 3501 status = decode_stateid(xdr, &res->stateid);
3352 READ_BUF(NFS4_STATEID_SIZE); 3502 return status;
3353 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3503}
3354 return 0; 3504
3505static int decode_verifier(struct xdr_stream *xdr, void *verifier)
3506{
3507 return decode_opaque_fixed(xdr, verifier, 8);
3355} 3508}
3356 3509
3357static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) 3510static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res)
3358{ 3511{
3359 __be32 *p;
3360 int status; 3512 int status;
3361 3513
3362 status = decode_op_hdr(xdr, OP_COMMIT); 3514 status = decode_op_hdr(xdr, OP_COMMIT);
3363 if (status) 3515 if (!status)
3364 return status; 3516 status = decode_verifier(xdr, res->verf->verifier);
3365 READ_BUF(8); 3517 return status;
3366 COPYMEM(res->verf->verifier, 8);
3367 return 0;
3368} 3518}
3369 3519
3370static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3520static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3378,10 +3528,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
3378 return status; 3528 return status;
3379 if ((status = decode_change_info(xdr, cinfo))) 3529 if ((status = decode_change_info(xdr, cinfo)))
3380 return status; 3530 return status;
3381 READ_BUF(4); 3531 p = xdr_inline_decode(xdr, 4);
3382 READ32(bmlen); 3532 if (unlikely(!p))
3383 READ_BUF(bmlen << 2); 3533 goto out_overflow;
3384 return 0; 3534 bmlen = be32_to_cpup(p);
3535 p = xdr_inline_decode(xdr, bmlen << 2);
3536 if (likely(p))
3537 return 0;
3538out_overflow:
3539 print_overflow_msg(__func__, xdr);
3540 return -EIO;
3385} 3541}
3386 3542
3387static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) 3543static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
@@ -3466,7 +3622,8 @@ xdr_error:
3466 return status; 3622 return status;
3467} 3623}
3468 3624
3469static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) 3625static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
3626 const struct nfs_server *server, int may_sleep)
3470{ 3627{
3471 __be32 *savep; 3628 __be32 *savep;
3472 uint32_t attrlen, 3629 uint32_t attrlen,
@@ -3538,12 +3695,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
3538 goto xdr_error; 3695 goto xdr_error;
3539 fattr->valid |= status; 3696 fattr->valid |= status;
3540 3697
3541 status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); 3698 status = decode_attr_owner(xdr, bitmap, server->nfs_client,
3699 &fattr->uid, may_sleep);
3542 if (status < 0) 3700 if (status < 0)
3543 goto xdr_error; 3701 goto xdr_error;
3544 fattr->valid |= status; 3702 fattr->valid |= status;
3545 3703
3546 status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); 3704 status = decode_attr_group(xdr, bitmap, server->nfs_client,
3705 &fattr->gid, may_sleep);
3547 if (status < 0) 3706 if (status < 0)
3548 goto xdr_error; 3707 goto xdr_error;
3549 fattr->valid |= status; 3708 fattr->valid |= status;
@@ -3633,14 +3792,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
3633 if (status) 3792 if (status)
3634 return status; 3793 return status;
3635 3794
3636 READ_BUF(4); 3795 p = xdr_inline_decode(xdr, 4);
3637 READ32(len); 3796 if (unlikely(!p))
3797 goto out_overflow;
3798 len = be32_to_cpup(p);
3638 if (len > NFS4_FHSIZE) 3799 if (len > NFS4_FHSIZE)
3639 return -EIO; 3800 return -EIO;
3640 fh->size = len; 3801 fh->size = len;
3641 READ_BUF(len); 3802 p = xdr_inline_decode(xdr, len);
3642 COPYMEM(fh->data, len); 3803 if (unlikely(!p))
3804 goto out_overflow;
3805 memcpy(fh->data, p, len);
3643 return 0; 3806 return 0;
3807out_overflow:
3808 print_overflow_msg(__func__, xdr);
3809 return -EIO;
3644} 3810}
3645 3811
3646static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 3812static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -3662,10 +3828,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3662 __be32 *p; 3828 __be32 *p;
3663 uint32_t namelen, type; 3829 uint32_t namelen, type;
3664 3830
3665 READ_BUF(32); 3831 p = xdr_inline_decode(xdr, 32);
3666 READ64(offset); 3832 if (unlikely(!p))
3667 READ64(length); 3833 goto out_overflow;
3668 READ32(type); 3834 p = xdr_decode_hyper(p, &offset);
3835 p = xdr_decode_hyper(p, &length);
3836 type = be32_to_cpup(p++);
3669 if (fl != NULL) { 3837 if (fl != NULL) {
3670 fl->fl_start = (loff_t)offset; 3838 fl->fl_start = (loff_t)offset;
3671 fl->fl_end = fl->fl_start + (loff_t)length - 1; 3839 fl->fl_end = fl->fl_start + (loff_t)length - 1;
@@ -3676,23 +3844,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
3676 fl->fl_type = F_RDLCK; 3844 fl->fl_type = F_RDLCK;
3677 fl->fl_pid = 0; 3845 fl->fl_pid = 0;
3678 } 3846 }
3679 READ64(clientid); 3847 p = xdr_decode_hyper(p, &clientid);
3680 READ32(namelen); 3848 namelen = be32_to_cpup(p);
3681 READ_BUF(namelen); 3849 p = xdr_inline_decode(xdr, namelen);
3682 return -NFS4ERR_DENIED; 3850 if (likely(p))
3851 return -NFS4ERR_DENIED;
3852out_overflow:
3853 print_overflow_msg(__func__, xdr);
3854 return -EIO;
3683} 3855}
3684 3856
3685static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) 3857static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res)
3686{ 3858{
3687 __be32 *p;
3688 int status; 3859 int status;
3689 3860
3690 status = decode_op_hdr(xdr, OP_LOCK); 3861 status = decode_op_hdr(xdr, OP_LOCK);
3691 if (status == -EIO) 3862 if (status == -EIO)
3692 goto out; 3863 goto out;
3693 if (status == 0) { 3864 if (status == 0) {
3694 READ_BUF(NFS4_STATEID_SIZE); 3865 status = decode_stateid(xdr, &res->stateid);
3695 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); 3866 if (unlikely(status))
3867 goto out;
3696 } else if (status == -NFS4ERR_DENIED) 3868 } else if (status == -NFS4ERR_DENIED)
3697 status = decode_lock_denied(xdr, NULL); 3869 status = decode_lock_denied(xdr, NULL);
3698 if (res->open_seqid != NULL) 3870 if (res->open_seqid != NULL)
@@ -3713,16 +3885,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res)
3713 3885
3714static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) 3886static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res)
3715{ 3887{
3716 __be32 *p;
3717 int status; 3888 int status;
3718 3889
3719 status = decode_op_hdr(xdr, OP_LOCKU); 3890 status = decode_op_hdr(xdr, OP_LOCKU);
3720 if (status != -EIO) 3891 if (status != -EIO)
3721 nfs_increment_lock_seqid(status, res->seqid); 3892 nfs_increment_lock_seqid(status, res->seqid);
3722 if (status == 0) { 3893 if (status == 0)
3723 READ_BUF(NFS4_STATEID_SIZE); 3894 status = decode_stateid(xdr, &res->stateid);
3724 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3725 }
3726 return status; 3895 return status;
3727} 3896}
3728 3897
@@ -3737,34 +3906,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
3737 __be32 *p; 3906 __be32 *p;
3738 uint32_t limit_type, nblocks, blocksize; 3907 uint32_t limit_type, nblocks, blocksize;
3739 3908
3740 READ_BUF(12); 3909 p = xdr_inline_decode(xdr, 12);
3741 READ32(limit_type); 3910 if (unlikely(!p))
3911 goto out_overflow;
3912 limit_type = be32_to_cpup(p++);
3742 switch (limit_type) { 3913 switch (limit_type) {
3743 case 1: 3914 case 1:
3744 READ64(*maxsize); 3915 xdr_decode_hyper(p, maxsize);
3745 break; 3916 break;
3746 case 2: 3917 case 2:
3747 READ32(nblocks); 3918 nblocks = be32_to_cpup(p++);
3748 READ32(blocksize); 3919 blocksize = be32_to_cpup(p);
3749 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; 3920 *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
3750 } 3921 }
3751 return 0; 3922 return 0;
3923out_overflow:
3924 print_overflow_msg(__func__, xdr);
3925 return -EIO;
3752} 3926}
3753 3927
3754static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) 3928static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3755{ 3929{
3756 __be32 *p; 3930 __be32 *p;
3757 uint32_t delegation_type; 3931 uint32_t delegation_type;
3932 int status;
3758 3933
3759 READ_BUF(4); 3934 p = xdr_inline_decode(xdr, 4);
3760 READ32(delegation_type); 3935 if (unlikely(!p))
3936 goto out_overflow;
3937 delegation_type = be32_to_cpup(p);
3761 if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { 3938 if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
3762 res->delegation_type = 0; 3939 res->delegation_type = 0;
3763 return 0; 3940 return 0;
3764 } 3941 }
3765 READ_BUF(NFS4_STATEID_SIZE+4); 3942 status = decode_stateid(xdr, &res->delegation);
3766 COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); 3943 if (unlikely(status))
3767 READ32(res->do_recall); 3944 return status;
3945 p = xdr_inline_decode(xdr, 4);
3946 if (unlikely(!p))
3947 goto out_overflow;
3948 res->do_recall = be32_to_cpup(p);
3768 3949
3769 switch (delegation_type) { 3950 switch (delegation_type) {
3770 case NFS4_OPEN_DELEGATE_READ: 3951 case NFS4_OPEN_DELEGATE_READ:
@@ -3776,6 +3957,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
3776 return -EIO; 3957 return -EIO;
3777 } 3958 }
3778 return decode_ace(xdr, NULL, res->server->nfs_client); 3959 return decode_ace(xdr, NULL, res->server->nfs_client);
3960out_overflow:
3961 print_overflow_msg(__func__, xdr);
3962 return -EIO;
3779} 3963}
3780 3964
3781static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) 3965static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
@@ -3787,23 +3971,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3787 status = decode_op_hdr(xdr, OP_OPEN); 3971 status = decode_op_hdr(xdr, OP_OPEN);
3788 if (status != -EIO) 3972 if (status != -EIO)
3789 nfs_increment_open_seqid(status, res->seqid); 3973 nfs_increment_open_seqid(status, res->seqid);
3790 if (status) 3974 if (!status)
3975 status = decode_stateid(xdr, &res->stateid);
3976 if (unlikely(status))
3791 return status; 3977 return status;
3792 READ_BUF(NFS4_STATEID_SIZE);
3793 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3794 3978
3795 decode_change_info(xdr, &res->cinfo); 3979 decode_change_info(xdr, &res->cinfo);
3796 3980
3797 READ_BUF(8); 3981 p = xdr_inline_decode(xdr, 8);
3798 READ32(res->rflags); 3982 if (unlikely(!p))
3799 READ32(bmlen); 3983 goto out_overflow;
3984 res->rflags = be32_to_cpup(p++);
3985 bmlen = be32_to_cpup(p);
3800 if (bmlen > 10) 3986 if (bmlen > 10)
3801 goto xdr_error; 3987 goto xdr_error;
3802 3988
3803 READ_BUF(bmlen << 2); 3989 p = xdr_inline_decode(xdr, bmlen << 2);
3990 if (unlikely(!p))
3991 goto out_overflow;
3804 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); 3992 savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
3805 for (i = 0; i < savewords; ++i) 3993 for (i = 0; i < savewords; ++i)
3806 READ32(res->attrset[i]); 3994 res->attrset[i] = be32_to_cpup(p++);
3807 for (; i < NFS4_BITMAP_SIZE; i++) 3995 for (; i < NFS4_BITMAP_SIZE; i++)
3808 res->attrset[i] = 0; 3996 res->attrset[i] = 0;
3809 3997
@@ -3811,36 +3999,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
3811xdr_error: 3999xdr_error:
3812 dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); 4000 dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
3813 return -EIO; 4001 return -EIO;
4002out_overflow:
4003 print_overflow_msg(__func__, xdr);
4004 return -EIO;
3814} 4005}
3815 4006
3816static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) 4007static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
3817{ 4008{
3818 __be32 *p;
3819 int status; 4009 int status;
3820 4010
3821 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); 4011 status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
3822 if (status != -EIO) 4012 if (status != -EIO)
3823 nfs_increment_open_seqid(status, res->seqid); 4013 nfs_increment_open_seqid(status, res->seqid);
3824 if (status) 4014 if (!status)
3825 return status; 4015 status = decode_stateid(xdr, &res->stateid);
3826 READ_BUF(NFS4_STATEID_SIZE); 4016 return status;
3827 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3828 return 0;
3829} 4017}
3830 4018
3831static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) 4019static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
3832{ 4020{
3833 __be32 *p;
3834 int status; 4021 int status;
3835 4022
3836 status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); 4023 status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE);
3837 if (status != -EIO) 4024 if (status != -EIO)
3838 nfs_increment_open_seqid(status, res->seqid); 4025 nfs_increment_open_seqid(status, res->seqid);
3839 if (status) 4026 if (!status)
3840 return status; 4027 status = decode_stateid(xdr, &res->stateid);
3841 READ_BUF(NFS4_STATEID_SIZE); 4028 return status;
3842 COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
3843 return 0;
3844} 4029}
3845 4030
3846static int decode_putfh(struct xdr_stream *xdr) 4031static int decode_putfh(struct xdr_stream *xdr)
@@ -3863,9 +4048,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
3863 status = decode_op_hdr(xdr, OP_READ); 4048 status = decode_op_hdr(xdr, OP_READ);
3864 if (status) 4049 if (status)
3865 return status; 4050 return status;
3866 READ_BUF(8); 4051 p = xdr_inline_decode(xdr, 8);
3867 READ32(eof); 4052 if (unlikely(!p))
3868 READ32(count); 4053 goto out_overflow;
4054 eof = be32_to_cpup(p++);
4055 count = be32_to_cpup(p);
3869 hdrlen = (u8 *) p - (u8 *) iov->iov_base; 4056 hdrlen = (u8 *) p - (u8 *) iov->iov_base;
3870 recvd = req->rq_rcv_buf.len - hdrlen; 4057 recvd = req->rq_rcv_buf.len - hdrlen;
3871 if (count > recvd) { 4058 if (count > recvd) {
@@ -3878,6 +4065,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_
3878 res->eof = eof; 4065 res->eof = eof;
3879 res->count = count; 4066 res->count = count;
3880 return 0; 4067 return 0;
4068out_overflow:
4069 print_overflow_msg(__func__, xdr);
4070 return -EIO;
3881} 4071}
3882 4072
3883static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) 4073static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir)
@@ -3892,17 +4082,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
3892 int status; 4082 int status;
3893 4083
3894 status = decode_op_hdr(xdr, OP_READDIR); 4084 status = decode_op_hdr(xdr, OP_READDIR);
3895 if (status) 4085 if (!status)
4086 status = decode_verifier(xdr, readdir->verifier.data);
4087 if (unlikely(status))
3896 return status; 4088 return status;
3897 READ_BUF(8);
3898 COPYMEM(readdir->verifier.data, 8);
3899 dprintk("%s: verifier = %08x:%08x\n", 4089 dprintk("%s: verifier = %08x:%08x\n",
3900 __func__, 4090 __func__,
3901 ((u32 *)readdir->verifier.data)[0], 4091 ((u32 *)readdir->verifier.data)[0],
3902 ((u32 *)readdir->verifier.data)[1]); 4092 ((u32 *)readdir->verifier.data)[1]);
3903 4093
3904 4094
3905 hdrlen = (char *) p - (char *) iov->iov_base; 4095 hdrlen = (char *) xdr->p - (char *) iov->iov_base;
3906 recvd = rcvbuf->len - hdrlen; 4096 recvd = rcvbuf->len - hdrlen;
3907 if (pglen > recvd) 4097 if (pglen > recvd)
3908 pglen = recvd; 4098 pglen = recvd;
@@ -3990,8 +4180,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
3990 return status; 4180 return status;
3991 4181
3992 /* Convert length of symlink */ 4182 /* Convert length of symlink */
3993 READ_BUF(4); 4183 p = xdr_inline_decode(xdr, 4);
3994 READ32(len); 4184 if (unlikely(!p))
4185 goto out_overflow;
4186 len = be32_to_cpup(p);
3995 if (len >= rcvbuf->page_len || len <= 0) { 4187 if (len >= rcvbuf->page_len || len <= 0) {
3996 dprintk("nfs: server returned giant symlink!\n"); 4188 dprintk("nfs: server returned giant symlink!\n");
3997 return -ENAMETOOLONG; 4189 return -ENAMETOOLONG;
@@ -4015,6 +4207,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req)
4015 kaddr[len+rcvbuf->page_base] = '\0'; 4207 kaddr[len+rcvbuf->page_base] = '\0';
4016 kunmap_atomic(kaddr, KM_USER0); 4208 kunmap_atomic(kaddr, KM_USER0);
4017 return 0; 4209 return 0;
4210out_overflow:
4211 print_overflow_msg(__func__, xdr);
4212 return -EIO;
4018} 4213}
4019 4214
4020static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) 4215static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
@@ -4112,10 +4307,16 @@ static int decode_setattr(struct xdr_stream *xdr)
4112 status = decode_op_hdr(xdr, OP_SETATTR); 4307 status = decode_op_hdr(xdr, OP_SETATTR);
4113 if (status) 4308 if (status)
4114 return status; 4309 return status;
4115 READ_BUF(4); 4310 p = xdr_inline_decode(xdr, 4);
4116 READ32(bmlen); 4311 if (unlikely(!p))
4117 READ_BUF(bmlen << 2); 4312 goto out_overflow;
4118 return 0; 4313 bmlen = be32_to_cpup(p);
4314 p = xdr_inline_decode(xdr, bmlen << 2);
4315 if (likely(p))
4316 return 0;
4317out_overflow:
4318 print_overflow_msg(__func__, xdr);
4319 return -EIO;
4119} 4320}
4120 4321
4121static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) 4322static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
@@ -4124,35 +4325,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
4124 uint32_t opnum; 4325 uint32_t opnum;
4125 int32_t nfserr; 4326 int32_t nfserr;
4126 4327
4127 READ_BUF(8); 4328 p = xdr_inline_decode(xdr, 8);
4128 READ32(opnum); 4329 if (unlikely(!p))
4330 goto out_overflow;
4331 opnum = be32_to_cpup(p++);
4129 if (opnum != OP_SETCLIENTID) { 4332 if (opnum != OP_SETCLIENTID) {
4130 dprintk("nfs: decode_setclientid: Server returned operation" 4333 dprintk("nfs: decode_setclientid: Server returned operation"
4131 " %d\n", opnum); 4334 " %d\n", opnum);
4132 return -EIO; 4335 return -EIO;
4133 } 4336 }
4134 READ32(nfserr); 4337 nfserr = be32_to_cpup(p);
4135 if (nfserr == NFS_OK) { 4338 if (nfserr == NFS_OK) {
4136 READ_BUF(8 + NFS4_VERIFIER_SIZE); 4339 p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE);
4137 READ64(clp->cl_clientid); 4340 if (unlikely(!p))
4138 COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); 4341 goto out_overflow;
4342 p = xdr_decode_hyper(p, &clp->cl_clientid);
4343 memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE);
4139 } else if (nfserr == NFSERR_CLID_INUSE) { 4344 } else if (nfserr == NFSERR_CLID_INUSE) {
4140 uint32_t len; 4345 uint32_t len;
4141 4346
4142 /* skip netid string */ 4347 /* skip netid string */
4143 READ_BUF(4); 4348 p = xdr_inline_decode(xdr, 4);
4144 READ32(len); 4349 if (unlikely(!p))
4145 READ_BUF(len); 4350 goto out_overflow;
4351 len = be32_to_cpup(p);
4352 p = xdr_inline_decode(xdr, len);
4353 if (unlikely(!p))
4354 goto out_overflow;
4146 4355
4147 /* skip uaddr string */ 4356 /* skip uaddr string */
4148 READ_BUF(4); 4357 p = xdr_inline_decode(xdr, 4);
4149 READ32(len); 4358 if (unlikely(!p))
4150 READ_BUF(len); 4359 goto out_overflow;
4360 len = be32_to_cpup(p);
4361 p = xdr_inline_decode(xdr, len);
4362 if (unlikely(!p))
4363 goto out_overflow;
4151 return -NFSERR_CLID_INUSE; 4364 return -NFSERR_CLID_INUSE;
4152 } else 4365 } else
4153 return nfs4_stat_to_errno(nfserr); 4366 return nfs4_stat_to_errno(nfserr);
4154 4367
4155 return 0; 4368 return 0;
4369out_overflow:
4370 print_overflow_msg(__func__, xdr);
4371 return -EIO;
4156} 4372}
4157 4373
4158static int decode_setclientid_confirm(struct xdr_stream *xdr) 4374static int decode_setclientid_confirm(struct xdr_stream *xdr)
@@ -4169,11 +4385,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
4169 if (status) 4385 if (status)
4170 return status; 4386 return status;
4171 4387
4172 READ_BUF(16); 4388 p = xdr_inline_decode(xdr, 16);
4173 READ32(res->count); 4389 if (unlikely(!p))
4174 READ32(res->verf->committed); 4390 goto out_overflow;
4175 COPYMEM(res->verf->verifier, 8); 4391 res->count = be32_to_cpup(p++);
4392 res->verf->committed = be32_to_cpup(p++);
4393 memcpy(res->verf->verifier, p, 8);
4176 return 0; 4394 return 0;
4395out_overflow:
4396 print_overflow_msg(__func__, xdr);
4397 return -EIO;
4177} 4398}
4178 4399
4179static int decode_delegreturn(struct xdr_stream *xdr) 4400static int decode_delegreturn(struct xdr_stream *xdr)
@@ -4187,6 +4408,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4187{ 4408{
4188 __be32 *p; 4409 __be32 *p;
4189 uint32_t dummy; 4410 uint32_t dummy;
4411 char *dummy_str;
4190 int status; 4412 int status;
4191 struct nfs_client *clp = res->client; 4413 struct nfs_client *clp = res->client;
4192 4414
@@ -4194,36 +4416,45 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4194 if (status) 4416 if (status)
4195 return status; 4417 return status;
4196 4418
4197 READ_BUF(8); 4419 p = xdr_inline_decode(xdr, 8);
4198 READ64(clp->cl_ex_clid); 4420 if (unlikely(!p))
4199 READ_BUF(12); 4421 goto out_overflow;
4200 READ32(clp->cl_seqid); 4422 xdr_decode_hyper(p, &clp->cl_ex_clid);
4201 READ32(clp->cl_exchange_flags); 4423 p = xdr_inline_decode(xdr, 12);
4424 if (unlikely(!p))
4425 goto out_overflow;
4426 clp->cl_seqid = be32_to_cpup(p++);
4427 clp->cl_exchange_flags = be32_to_cpup(p++);
4202 4428
4203 /* We ask for SP4_NONE */ 4429 /* We ask for SP4_NONE */
4204 READ32(dummy); 4430 dummy = be32_to_cpup(p);
4205 if (dummy != SP4_NONE) 4431 if (dummy != SP4_NONE)
4206 return -EIO; 4432 return -EIO;
4207 4433
4208 /* Throw away minor_id */ 4434 /* Throw away minor_id */
4209 READ_BUF(8); 4435 p = xdr_inline_decode(xdr, 8);
4436 if (unlikely(!p))
4437 goto out_overflow;
4210 4438
4211 /* Throw away Major id */ 4439 /* Throw away Major id */
4212 READ_BUF(4); 4440 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4213 READ32(dummy); 4441 if (unlikely(status))
4214 READ_BUF(dummy); 4442 return status;
4215 4443
4216 /* Throw away server_scope */ 4444 /* Throw away server_scope */
4217 READ_BUF(4); 4445 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4218 READ32(dummy); 4446 if (unlikely(status))
4219 READ_BUF(dummy); 4447 return status;
4220 4448
4221 /* Throw away Implementation id array */ 4449 /* Throw away Implementation id array */
4222 READ_BUF(4); 4450 status = decode_opaque_inline(xdr, &dummy, &dummy_str);
4223 READ32(dummy); 4451 if (unlikely(status))
4224 READ_BUF(dummy); 4452 return status;
4225 4453
4226 return 0; 4454 return 0;
4455out_overflow:
4456 print_overflow_msg(__func__, xdr);
4457 return -EIO;
4227} 4458}
4228 4459
4229static int decode_chan_attrs(struct xdr_stream *xdr, 4460static int decode_chan_attrs(struct xdr_stream *xdr,
@@ -4232,22 +4463,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr,
4232 __be32 *p; 4463 __be32 *p;
4233 u32 nr_attrs; 4464 u32 nr_attrs;
4234 4465
4235 READ_BUF(28); 4466 p = xdr_inline_decode(xdr, 28);
4236 READ32(attrs->headerpadsz); 4467 if (unlikely(!p))
4237 READ32(attrs->max_rqst_sz); 4468 goto out_overflow;
4238 READ32(attrs->max_resp_sz); 4469 attrs->headerpadsz = be32_to_cpup(p++);
4239 READ32(attrs->max_resp_sz_cached); 4470 attrs->max_rqst_sz = be32_to_cpup(p++);
4240 READ32(attrs->max_ops); 4471 attrs->max_resp_sz = be32_to_cpup(p++);
4241 READ32(attrs->max_reqs); 4472 attrs->max_resp_sz_cached = be32_to_cpup(p++);
4242 READ32(nr_attrs); 4473 attrs->max_ops = be32_to_cpup(p++);
4474 attrs->max_reqs = be32_to_cpup(p++);
4475 nr_attrs = be32_to_cpup(p);
4243 if (unlikely(nr_attrs > 1)) { 4476 if (unlikely(nr_attrs > 1)) {
4244 printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", 4477 printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n",
4245 __func__, nr_attrs); 4478 __func__, nr_attrs);
4246 return -EINVAL; 4479 return -EINVAL;
4247 } 4480 }
4248 if (nr_attrs == 1) 4481 if (nr_attrs == 1) {
4249 READ_BUF(4); /* skip rdma_attrs */ 4482 p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */
4483 if (unlikely(!p))
4484 goto out_overflow;
4485 }
4250 return 0; 4486 return 0;
4487out_overflow:
4488 print_overflow_msg(__func__, xdr);
4489 return -EIO;
4490}
4491
4492static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid)
4493{
4494 return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN);
4251} 4495}
4252 4496
4253static int decode_create_session(struct xdr_stream *xdr, 4497static int decode_create_session(struct xdr_stream *xdr,
@@ -4259,24 +4503,26 @@ static int decode_create_session(struct xdr_stream *xdr,
4259 struct nfs4_session *session = clp->cl_session; 4503 struct nfs4_session *session = clp->cl_session;
4260 4504
4261 status = decode_op_hdr(xdr, OP_CREATE_SESSION); 4505 status = decode_op_hdr(xdr, OP_CREATE_SESSION);
4262 4506 if (!status)
4263 if (status) 4507 status = decode_sessionid(xdr, &session->sess_id);
4508 if (unlikely(status))
4264 return status; 4509 return status;
4265 4510
4266 /* sessionid */
4267 READ_BUF(NFS4_MAX_SESSIONID_LEN);
4268 COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN);
4269
4270 /* seqid, flags */ 4511 /* seqid, flags */
4271 READ_BUF(8); 4512 p = xdr_inline_decode(xdr, 8);
4272 READ32(clp->cl_seqid); 4513 if (unlikely(!p))
4273 READ32(session->flags); 4514 goto out_overflow;
4515 clp->cl_seqid = be32_to_cpup(p++);
4516 session->flags = be32_to_cpup(p);
4274 4517
4275 /* Channel attributes */ 4518 /* Channel attributes */
4276 status = decode_chan_attrs(xdr, &session->fc_attrs); 4519 status = decode_chan_attrs(xdr, &session->fc_attrs);
4277 if (!status) 4520 if (!status)
4278 status = decode_chan_attrs(xdr, &session->bc_attrs); 4521 status = decode_chan_attrs(xdr, &session->bc_attrs);
4279 return status; 4522 return status;
4523out_overflow:
4524 print_overflow_msg(__func__, xdr);
4525 return -EIO;
4280} 4526}
4281 4527
4282static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) 4528static int decode_destroy_session(struct xdr_stream *xdr, void *dummy)
@@ -4300,7 +4546,9 @@ static int decode_sequence(struct xdr_stream *xdr,
4300 return 0; 4546 return 0;
4301 4547
4302 status = decode_op_hdr(xdr, OP_SEQUENCE); 4548 status = decode_op_hdr(xdr, OP_SEQUENCE);
4303 if (status) 4549 if (!status)
4550 status = decode_sessionid(xdr, &id);
4551 if (unlikely(status))
4304 goto out_err; 4552 goto out_err;
4305 4553
4306 /* 4554 /*
@@ -4309,36 +4557,43 @@ static int decode_sequence(struct xdr_stream *xdr,
4309 */ 4557 */
4310 status = -ESERVERFAULT; 4558 status = -ESERVERFAULT;
4311 4559
4312 slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
4313 READ_BUF(NFS4_MAX_SESSIONID_LEN + 20);
4314 COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN);
4315 if (memcmp(id.data, res->sr_session->sess_id.data, 4560 if (memcmp(id.data, res->sr_session->sess_id.data,
4316 NFS4_MAX_SESSIONID_LEN)) { 4561 NFS4_MAX_SESSIONID_LEN)) {
4317 dprintk("%s Invalid session id\n", __func__); 4562 dprintk("%s Invalid session id\n", __func__);
4318 goto out_err; 4563 goto out_err;
4319 } 4564 }
4565
4566 p = xdr_inline_decode(xdr, 20);
4567 if (unlikely(!p))
4568 goto out_overflow;
4569
4320 /* seqid */ 4570 /* seqid */
4321 READ32(dummy); 4571 slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid];
4572 dummy = be32_to_cpup(p++);
4322 if (dummy != slot->seq_nr) { 4573 if (dummy != slot->seq_nr) {
4323 dprintk("%s Invalid sequence number\n", __func__); 4574 dprintk("%s Invalid sequence number\n", __func__);
4324 goto out_err; 4575 goto out_err;
4325 } 4576 }
4326 /* slot id */ 4577 /* slot id */
4327 READ32(dummy); 4578 dummy = be32_to_cpup(p++);
4328 if (dummy != res->sr_slotid) { 4579 if (dummy != res->sr_slotid) {
4329 dprintk("%s Invalid slot id\n", __func__); 4580 dprintk("%s Invalid slot id\n", __func__);
4330 goto out_err; 4581 goto out_err;
4331 } 4582 }
4332 /* highest slot id - currently not processed */ 4583 /* highest slot id - currently not processed */
4333 READ32(dummy); 4584 dummy = be32_to_cpup(p++);
4334 /* target highest slot id - currently not processed */ 4585 /* target highest slot id - currently not processed */
4335 READ32(dummy); 4586 dummy = be32_to_cpup(p++);
4336 /* result flags - currently not processed */ 4587 /* result flags - currently not processed */
4337 READ32(dummy); 4588 dummy = be32_to_cpup(p);
4338 status = 0; 4589 status = 0;
4339out_err: 4590out_err:
4340 res->sr_status = status; 4591 res->sr_status = status;
4341 return status; 4592 return status;
4593out_overflow:
4594 print_overflow_msg(__func__, xdr);
4595 status = -EIO;
4596 goto out_err;
4342#else /* CONFIG_NFS_V4_1 */ 4597#else /* CONFIG_NFS_V4_1 */
4343 return 0; 4598 return 0;
4344#endif /* CONFIG_NFS_V4_1 */ 4599#endif /* CONFIG_NFS_V4_1 */
@@ -4370,7 +4625,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct
4370 status = decode_open_downgrade(&xdr, res); 4625 status = decode_open_downgrade(&xdr, res);
4371 if (status != 0) 4626 if (status != 0)
4372 goto out; 4627 goto out;
4373 decode_getfattr(&xdr, res->fattr, res->server); 4628 decode_getfattr(&xdr, res->fattr, res->server,
4629 !RPC_IS_ASYNC(rqstp->rq_task));
4374out: 4630out:
4375 return status; 4631 return status;
4376} 4632}
@@ -4397,7 +4653,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
4397 status = decode_access(&xdr, res); 4653 status = decode_access(&xdr, res);
4398 if (status != 0) 4654 if (status != 0)
4399 goto out; 4655 goto out;
4400 decode_getfattr(&xdr, res->fattr, res->server); 4656 decode_getfattr(&xdr, res->fattr, res->server,
4657 !RPC_IS_ASYNC(rqstp->rq_task));
4401out: 4658out:
4402 return status; 4659 return status;
4403} 4660}
@@ -4424,7 +4681,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
4424 goto out; 4681 goto out;
4425 if ((status = decode_getfh(&xdr, res->fh)) != 0) 4682 if ((status = decode_getfh(&xdr, res->fh)) != 0)
4426 goto out; 4683 goto out;
4427 status = decode_getfattr(&xdr, res->fattr, res->server); 4684 status = decode_getfattr(&xdr, res->fattr, res->server
4685 ,!RPC_IS_ASYNC(rqstp->rq_task));
4428out: 4686out:
4429 return status; 4687 return status;
4430} 4688}
@@ -4448,7 +4706,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
4448 if ((status = decode_putrootfh(&xdr)) != 0) 4706 if ((status = decode_putrootfh(&xdr)) != 0)
4449 goto out; 4707 goto out;
4450 if ((status = decode_getfh(&xdr, res->fh)) == 0) 4708 if ((status = decode_getfh(&xdr, res->fh)) == 0)
4451 status = decode_getfattr(&xdr, res->fattr, res->server); 4709 status = decode_getfattr(&xdr, res->fattr, res->server,
4710 !RPC_IS_ASYNC(rqstp->rq_task));
4452out: 4711out:
4453 return status; 4712 return status;
4454} 4713}
@@ -4473,7 +4732,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
4473 goto out; 4732 goto out;
4474 if ((status = decode_remove(&xdr, &res->cinfo)) != 0) 4733 if ((status = decode_remove(&xdr, &res->cinfo)) != 0)
4475 goto out; 4734 goto out;
4476 decode_getfattr(&xdr, &res->dir_attr, res->server); 4735 decode_getfattr(&xdr, &res->dir_attr, res->server,
4736 !RPC_IS_ASYNC(rqstp->rq_task));
4477out: 4737out:
4478 return status; 4738 return status;
4479} 4739}
@@ -4503,11 +4763,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
4503 if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) 4763 if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0)
4504 goto out; 4764 goto out;
4505 /* Current FH is target directory */ 4765 /* Current FH is target directory */
4506 if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) 4766 if (decode_getfattr(&xdr, res->new_fattr, res->server,
4767 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4507 goto out; 4768 goto out;
4508 if ((status = decode_restorefh(&xdr)) != 0) 4769 if ((status = decode_restorefh(&xdr)) != 0)
4509 goto out; 4770 goto out;
4510 decode_getfattr(&xdr, res->old_fattr, res->server); 4771 decode_getfattr(&xdr, res->old_fattr, res->server,
4772 !RPC_IS_ASYNC(rqstp->rq_task));
4511out: 4773out:
4512 return status; 4774 return status;
4513} 4775}
@@ -4540,11 +4802,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
4540 * Note order: OP_LINK leaves the directory as the current 4802 * Note order: OP_LINK leaves the directory as the current
4541 * filehandle. 4803 * filehandle.
4542 */ 4804 */
4543 if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) 4805 if (decode_getfattr(&xdr, res->dir_attr, res->server,
4806 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4544 goto out; 4807 goto out;
4545 if ((status = decode_restorefh(&xdr)) != 0) 4808 if ((status = decode_restorefh(&xdr)) != 0)
4546 goto out; 4809 goto out;
4547 decode_getfattr(&xdr, res->fattr, res->server); 4810 decode_getfattr(&xdr, res->fattr, res->server,
4811 !RPC_IS_ASYNC(rqstp->rq_task));
4548out: 4812out:
4549 return status; 4813 return status;
4550} 4814}
@@ -4573,11 +4837,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
4573 goto out; 4837 goto out;
4574 if ((status = decode_getfh(&xdr, res->fh)) != 0) 4838 if ((status = decode_getfh(&xdr, res->fh)) != 0)
4575 goto out; 4839 goto out;
4576 if (decode_getfattr(&xdr, res->fattr, res->server) != 0) 4840 if (decode_getfattr(&xdr, res->fattr, res->server,
4841 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4577 goto out; 4842 goto out;
4578 if ((status = decode_restorefh(&xdr)) != 0) 4843 if ((status = decode_restorefh(&xdr)) != 0)
4579 goto out; 4844 goto out;
4580 decode_getfattr(&xdr, res->dir_fattr, res->server); 4845 decode_getfattr(&xdr, res->dir_fattr, res->server,
4846 !RPC_IS_ASYNC(rqstp->rq_task));
4581out: 4847out:
4582 return status; 4848 return status;
4583} 4849}
@@ -4609,7 +4875,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
4609 status = decode_putfh(&xdr); 4875 status = decode_putfh(&xdr);
4610 if (status) 4876 if (status)
4611 goto out; 4877 goto out;
4612 status = decode_getfattr(&xdr, res->fattr, res->server); 4878 status = decode_getfattr(&xdr, res->fattr, res->server,
4879 !RPC_IS_ASYNC(rqstp->rq_task));
4613out: 4880out:
4614 return status; 4881 return status;
4615} 4882}
@@ -4716,7 +4983,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
4716 * an ESTALE error. Shouldn't be a problem, 4983 * an ESTALE error. Shouldn't be a problem,
4717 * though, since fattr->valid will remain unset. 4984 * though, since fattr->valid will remain unset.
4718 */ 4985 */
4719 decode_getfattr(&xdr, res->fattr, res->server); 4986 decode_getfattr(&xdr, res->fattr, res->server,
4987 !RPC_IS_ASYNC(rqstp->rq_task));
4720out: 4988out:
4721 return status; 4989 return status;
4722} 4990}
@@ -4748,11 +5016,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
4748 goto out; 5016 goto out;
4749 if (decode_getfh(&xdr, &res->fh) != 0) 5017 if (decode_getfh(&xdr, &res->fh) != 0)
4750 goto out; 5018 goto out;
4751 if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) 5019 if (decode_getfattr(&xdr, res->f_attr, res->server,
5020 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
4752 goto out; 5021 goto out;
4753 if (decode_restorefh(&xdr) != 0) 5022 if (decode_restorefh(&xdr) != 0)
4754 goto out; 5023 goto out;
4755 decode_getfattr(&xdr, res->dir_attr, res->server); 5024 decode_getfattr(&xdr, res->dir_attr, res->server,
5025 !RPC_IS_ASYNC(rqstp->rq_task));
4756out: 5026out:
4757 return status; 5027 return status;
4758} 5028}
@@ -4800,7 +5070,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf
4800 status = decode_open(&xdr, res); 5070 status = decode_open(&xdr, res);
4801 if (status) 5071 if (status)
4802 goto out; 5072 goto out;
4803 decode_getfattr(&xdr, res->f_attr, res->server); 5073 decode_getfattr(&xdr, res->f_attr, res->server,
5074 !RPC_IS_ASYNC(rqstp->rq_task));
4804out: 5075out:
4805 return status; 5076 return status;
4806} 5077}
@@ -4827,7 +5098,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
4827 status = decode_setattr(&xdr); 5098 status = decode_setattr(&xdr);
4828 if (status) 5099 if (status)
4829 goto out; 5100 goto out;
4830 decode_getfattr(&xdr, res->fattr, res->server); 5101 decode_getfattr(&xdr, res->fattr, res->server,
5102 !RPC_IS_ASYNC(rqstp->rq_task));
4831out: 5103out:
4832 return status; 5104 return status;
4833} 5105}
@@ -5001,7 +5273,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ
5001 status = decode_write(&xdr, res); 5273 status = decode_write(&xdr, res);
5002 if (status) 5274 if (status)
5003 goto out; 5275 goto out;
5004 decode_getfattr(&xdr, res->fattr, res->server); 5276 decode_getfattr(&xdr, res->fattr, res->server,
5277 !RPC_IS_ASYNC(rqstp->rq_task));
5005 if (!status) 5278 if (!status)
5006 status = res->count; 5279 status = res->count;
5007out: 5280out:
@@ -5030,7 +5303,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri
5030 status = decode_commit(&xdr, res); 5303 status = decode_commit(&xdr, res);
5031 if (status) 5304 if (status)
5032 goto out; 5305 goto out;
5033 decode_getfattr(&xdr, res->fattr, res->server); 5306 decode_getfattr(&xdr, res->fattr, res->server,
5307 !RPC_IS_ASYNC(rqstp->rq_task));
5034out: 5308out:
5035 return status; 5309 return status;
5036} 5310}
@@ -5194,7 +5468,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5194 if (status != 0) 5468 if (status != 0)
5195 goto out; 5469 goto out;
5196 status = decode_delegreturn(&xdr); 5470 status = decode_delegreturn(&xdr);
5197 decode_getfattr(&xdr, res->fattr, res->server); 5471 decode_getfattr(&xdr, res->fattr, res->server,
5472 !RPC_IS_ASYNC(rqstp->rq_task));
5198out: 5473out:
5199 return status; 5474 return status;
5200} 5475}
@@ -5222,7 +5497,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p,
5222 goto out; 5497 goto out;
5223 xdr_enter_page(&xdr, PAGE_SIZE); 5498 xdr_enter_page(&xdr, PAGE_SIZE);
5224 status = decode_getfattr(&xdr, &res->fs_locations->fattr, 5499 status = decode_getfattr(&xdr, &res->fs_locations->fattr,
5225 res->fs_locations->server); 5500 res->fs_locations->server,
5501 !RPC_IS_ASYNC(req->rq_task));
5226out: 5502out:
5227 return status; 5503 return status;
5228} 5504}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0b4cbdc60abd..de935692d40d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -73,7 +73,7 @@ enum {
73 Opt_cto, Opt_nocto, 73 Opt_cto, Opt_nocto,
74 Opt_ac, Opt_noac, 74 Opt_ac, Opt_noac,
75 Opt_lock, Opt_nolock, 75 Opt_lock, Opt_nolock,
76 Opt_v2, Opt_v3, 76 Opt_v2, Opt_v3, Opt_v4,
77 Opt_udp, Opt_tcp, Opt_rdma, 77 Opt_udp, Opt_tcp, Opt_rdma,
78 Opt_acl, Opt_noacl, 78 Opt_acl, Opt_noacl,
79 Opt_rdirplus, Opt_nordirplus, 79 Opt_rdirplus, Opt_nordirplus,
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = {
127 { Opt_nolock, "nolock" }, 127 { Opt_nolock, "nolock" },
128 { Opt_v2, "v2" }, 128 { Opt_v2, "v2" },
129 { Opt_v3, "v3" }, 129 { Opt_v3, "v3" },
130 { Opt_v4, "v4" },
130 { Opt_udp, "udp" }, 131 { Opt_udp, "udp" },
131 { Opt_tcp, "tcp" }, 132 { Opt_tcp, "tcp" },
132 { Opt_rdma, "rdma" }, 133 { Opt_rdma, "rdma" },
@@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = {
158 { Opt_mountvers, "mountvers=%s" }, 159 { Opt_mountvers, "mountvers=%s" },
159 { Opt_nfsvers, "nfsvers=%s" }, 160 { Opt_nfsvers, "nfsvers=%s" },
160 { Opt_nfsvers, "vers=%s" }, 161 { Opt_nfsvers, "vers=%s" },
161 { Opt_minorversion, "minorversion=%u" }, 162 { Opt_minorversion, "minorversion=%s" },
162 163
163 { Opt_sec, "sec=%s" }, 164 { Opt_sec, "sec=%s" },
164 { Opt_proto, "proto=%s" }, 165 { Opt_proto, "proto=%s" },
@@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = {
272}; 273};
273 274
274#ifdef CONFIG_NFS_V4 275#ifdef CONFIG_NFS_V4
276static int nfs4_validate_text_mount_data(void *options,
277 struct nfs_parsed_mount_data *args, const char *dev_name);
278static int nfs4_try_mount(int flags, const char *dev_name,
279 struct nfs_parsed_mount_data *data, struct vfsmount *mnt);
275static int nfs4_get_sb(struct file_system_type *fs_type, 280static int nfs4_get_sb(struct file_system_type *fs_type,
276 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 281 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
277static int nfs4_remote_get_sb(struct file_system_type *fs_type, 282static int nfs4_remote_get_sb(struct file_system_type *fs_type,
@@ -742,127 +747,23 @@ static int nfs_verify_server_address(struct sockaddr *addr)
742 } 747 }
743 } 748 }
744 749
750 dfprintk(MOUNT, "NFS: Invalid IP address specified\n");
745 return 0; 751 return 0;
746} 752}
747 753
748static void nfs_parse_ipv4_address(char *string, size_t str_len,
749 struct sockaddr *sap, size_t *addr_len)
750{
751 struct sockaddr_in *sin = (struct sockaddr_in *)sap;
752 u8 *addr = (u8 *)&sin->sin_addr.s_addr;
753
754 if (str_len <= INET_ADDRSTRLEN) {
755 dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
756 (int)str_len, string);
757
758 sin->sin_family = AF_INET;
759 *addr_len = sizeof(*sin);
760 if (in4_pton(string, str_len, addr, '\0', NULL))
761 return;
762 }
763
764 sap->sa_family = AF_UNSPEC;
765 *addr_len = 0;
766}
767
768#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
769static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
770 const char *delim,
771 struct sockaddr_in6 *sin6)
772{
773 char *p;
774 size_t len;
775
776 if ((string + str_len) == delim)
777 return 1;
778
779 if (*delim != IPV6_SCOPE_DELIMITER)
780 return 0;
781
782 if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
783 return 0;
784
785 len = (string + str_len) - delim - 1;
786 p = kstrndup(delim + 1, len, GFP_KERNEL);
787 if (p) {
788 unsigned long scope_id = 0;
789 struct net_device *dev;
790
791 dev = dev_get_by_name(&init_net, p);
792 if (dev != NULL) {
793 scope_id = dev->ifindex;
794 dev_put(dev);
795 } else {
796 if (strict_strtoul(p, 10, &scope_id) == 0) {
797 kfree(p);
798 return 0;
799 }
800 }
801
802 kfree(p);
803
804 sin6->sin6_scope_id = scope_id;
805 dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
806 return 1;
807 }
808
809 return 0;
810}
811
812static void nfs_parse_ipv6_address(char *string, size_t str_len,
813 struct sockaddr *sap, size_t *addr_len)
814{
815 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
816 u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
817 const char *delim;
818
819 if (str_len <= INET6_ADDRSTRLEN) {
820 dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
821 (int)str_len, string);
822
823 sin6->sin6_family = AF_INET6;
824 *addr_len = sizeof(*sin6);
825 if (in6_pton(string, str_len, addr,
826 IPV6_SCOPE_DELIMITER, &delim) != 0) {
827 if (nfs_parse_ipv6_scope_id(string, str_len,
828 delim, sin6) != 0)
829 return;
830 }
831 }
832
833 sap->sa_family = AF_UNSPEC;
834 *addr_len = 0;
835}
836#else
837static void nfs_parse_ipv6_address(char *string, size_t str_len,
838 struct sockaddr *sap, size_t *addr_len)
839{
840 sap->sa_family = AF_UNSPEC;
841 *addr_len = 0;
842}
843#endif
844
845/* 754/*
846 * Construct a sockaddr based on the contents of a string that contains 755 * Select between a default port value and a user-specified port value.
847 * an IP address in presentation format. 756 * If a zero value is set, then autobind will be used.
848 *
849 * If there is a problem constructing the new sockaddr, set the address
850 * family to AF_UNSPEC.
851 */ 757 */
852void nfs_parse_ip_address(char *string, size_t str_len, 758static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port,
853 struct sockaddr *sap, size_t *addr_len) 759 const unsigned short default_port)
854{ 760{
855 unsigned int i, colons; 761 unsigned short port = default_port;
856 762
857 colons = 0; 763 if (parsed_port != NFS_UNSPEC_PORT)
858 for (i = 0; i < str_len; i++) 764 port = parsed_port;
859 if (string[i] == ':')
860 colons++;
861 765
862 if (colons >= 2) 766 rpc_set_port(sap, port);
863 nfs_parse_ipv6_address(string, str_len, sap, addr_len);
864 else
865 nfs_parse_ipv4_address(string, str_len, sap, addr_len);
866} 767}
867 768
868/* 769/*
@@ -904,8 +805,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
904 805
905/* 806/*
906 * Parse the value of the 'sec=' option. 807 * Parse the value of the 'sec=' option.
907 *
908 * The flavor_len setting is for v4 mounts.
909 */ 808 */
910static int nfs_parse_security_flavors(char *value, 809static int nfs_parse_security_flavors(char *value,
911 struct nfs_parsed_mount_data *mnt) 810 struct nfs_parsed_mount_data *mnt)
@@ -916,53 +815,43 @@ static int nfs_parse_security_flavors(char *value,
916 815
917 switch (match_token(value, nfs_secflavor_tokens, args)) { 816 switch (match_token(value, nfs_secflavor_tokens, args)) {
918 case Opt_sec_none: 817 case Opt_sec_none:
919 mnt->auth_flavor_len = 0;
920 mnt->auth_flavors[0] = RPC_AUTH_NULL; 818 mnt->auth_flavors[0] = RPC_AUTH_NULL;
921 break; 819 break;
922 case Opt_sec_sys: 820 case Opt_sec_sys:
923 mnt->auth_flavor_len = 0;
924 mnt->auth_flavors[0] = RPC_AUTH_UNIX; 821 mnt->auth_flavors[0] = RPC_AUTH_UNIX;
925 break; 822 break;
926 case Opt_sec_krb5: 823 case Opt_sec_krb5:
927 mnt->auth_flavor_len = 1;
928 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; 824 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
929 break; 825 break;
930 case Opt_sec_krb5i: 826 case Opt_sec_krb5i:
931 mnt->auth_flavor_len = 1;
932 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; 827 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
933 break; 828 break;
934 case Opt_sec_krb5p: 829 case Opt_sec_krb5p:
935 mnt->auth_flavor_len = 1;
936 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; 830 mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
937 break; 831 break;
938 case Opt_sec_lkey: 832 case Opt_sec_lkey:
939 mnt->auth_flavor_len = 1;
940 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; 833 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
941 break; 834 break;
942 case Opt_sec_lkeyi: 835 case Opt_sec_lkeyi:
943 mnt->auth_flavor_len = 1;
944 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; 836 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
945 break; 837 break;
946 case Opt_sec_lkeyp: 838 case Opt_sec_lkeyp:
947 mnt->auth_flavor_len = 1;
948 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; 839 mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
949 break; 840 break;
950 case Opt_sec_spkm: 841 case Opt_sec_spkm:
951 mnt->auth_flavor_len = 1;
952 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; 842 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
953 break; 843 break;
954 case Opt_sec_spkmi: 844 case Opt_sec_spkmi:
955 mnt->auth_flavor_len = 1;
956 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; 845 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
957 break; 846 break;
958 case Opt_sec_spkmp: 847 case Opt_sec_spkmp:
959 mnt->auth_flavor_len = 1;
960 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; 848 mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
961 break; 849 break;
962 default: 850 default:
963 return 0; 851 return 0;
964 } 852 }
965 853
854 mnt->auth_flavor_len = 1;
966 return 1; 855 return 1;
967} 856}
968 857
@@ -1001,7 +890,6 @@ static int nfs_parse_mount_options(char *raw,
1001 while ((p = strsep(&raw, ",")) != NULL) { 890 while ((p = strsep(&raw, ",")) != NULL) {
1002 substring_t args[MAX_OPT_ARGS]; 891 substring_t args[MAX_OPT_ARGS];
1003 unsigned long option; 892 unsigned long option;
1004 int int_option;
1005 int token; 893 int token;
1006 894
1007 if (!*p) 895 if (!*p)
@@ -1047,10 +935,18 @@ static int nfs_parse_mount_options(char *raw,
1047 break; 935 break;
1048 case Opt_v2: 936 case Opt_v2:
1049 mnt->flags &= ~NFS_MOUNT_VER3; 937 mnt->flags &= ~NFS_MOUNT_VER3;
938 mnt->version = 2;
1050 break; 939 break;
1051 case Opt_v3: 940 case Opt_v3:
1052 mnt->flags |= NFS_MOUNT_VER3; 941 mnt->flags |= NFS_MOUNT_VER3;
942 mnt->version = 3;
1053 break; 943 break;
944#ifdef CONFIG_NFS_V4
945 case Opt_v4:
946 mnt->flags &= ~NFS_MOUNT_VER3;
947 mnt->version = 4;
948 break;
949#endif
1054 case Opt_udp: 950 case Opt_udp:
1055 mnt->flags &= ~NFS_MOUNT_TCP; 951 mnt->flags &= ~NFS_MOUNT_TCP;
1056 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 952 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
@@ -1264,20 +1160,33 @@ static int nfs_parse_mount_options(char *raw,
1264 switch (option) { 1160 switch (option) {
1265 case NFS2_VERSION: 1161 case NFS2_VERSION:
1266 mnt->flags &= ~NFS_MOUNT_VER3; 1162 mnt->flags &= ~NFS_MOUNT_VER3;
1163 mnt->version = 2;
1267 break; 1164 break;
1268 case NFS3_VERSION: 1165 case NFS3_VERSION:
1269 mnt->flags |= NFS_MOUNT_VER3; 1166 mnt->flags |= NFS_MOUNT_VER3;
1167 mnt->version = 3;
1270 break; 1168 break;
1169#ifdef CONFIG_NFS_V4
1170 case NFS4_VERSION:
1171 mnt->flags &= ~NFS_MOUNT_VER3;
1172 mnt->version = 4;
1173 break;
1174#endif
1271 default: 1175 default:
1272 goto out_invalid_value; 1176 goto out_invalid_value;
1273 } 1177 }
1274 break; 1178 break;
1275 case Opt_minorversion: 1179 case Opt_minorversion:
1276 if (match_int(args, &int_option)) 1180 string = match_strdup(args);
1277 return 0; 1181 if (string == NULL)
1278 if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) 1182 goto out_nomem;
1279 return 0; 1183 rc = strict_strtoul(string, 10, &option);
1280 mnt->minorversion = int_option; 1184 kfree(string);
1185 if (rc != 0)
1186 goto out_invalid_value;
1187 if (option > NFS4_MAX_MINOR_VERSION)
1188 goto out_invalid_value;
1189 mnt->minorversion = option;
1281 break; 1190 break;
1282 1191
1283 /* 1192 /*
@@ -1352,11 +1261,14 @@ static int nfs_parse_mount_options(char *raw,
1352 string = match_strdup(args); 1261 string = match_strdup(args);
1353 if (string == NULL) 1262 if (string == NULL)
1354 goto out_nomem; 1263 goto out_nomem;
1355 nfs_parse_ip_address(string, strlen(string), 1264 mnt->nfs_server.addrlen =
1356 (struct sockaddr *) 1265 rpc_pton(string, strlen(string),
1357 &mnt->nfs_server.address, 1266 (struct sockaddr *)
1358 &mnt->nfs_server.addrlen); 1267 &mnt->nfs_server.address,
1268 sizeof(mnt->nfs_server.address));
1359 kfree(string); 1269 kfree(string);
1270 if (mnt->nfs_server.addrlen == 0)
1271 goto out_invalid_address;
1360 break; 1272 break;
1361 case Opt_clientaddr: 1273 case Opt_clientaddr:
1362 string = match_strdup(args); 1274 string = match_strdup(args);
@@ -1376,11 +1288,14 @@ static int nfs_parse_mount_options(char *raw,
1376 string = match_strdup(args); 1288 string = match_strdup(args);
1377 if (string == NULL) 1289 if (string == NULL)
1378 goto out_nomem; 1290 goto out_nomem;
1379 nfs_parse_ip_address(string, strlen(string), 1291 mnt->mount_server.addrlen =
1380 (struct sockaddr *) 1292 rpc_pton(string, strlen(string),
1381 &mnt->mount_server.address, 1293 (struct sockaddr *)
1382 &mnt->mount_server.addrlen); 1294 &mnt->mount_server.address,
1295 sizeof(mnt->mount_server.address));
1383 kfree(string); 1296 kfree(string);
1297 if (mnt->mount_server.addrlen == 0)
1298 goto out_invalid_address;
1384 break; 1299 break;
1385 case Opt_lookupcache: 1300 case Opt_lookupcache:
1386 string = match_strdup(args); 1301 string = match_strdup(args);
@@ -1432,8 +1347,11 @@ static int nfs_parse_mount_options(char *raw,
1432 1347
1433 return 1; 1348 return 1;
1434 1349
1350out_invalid_address:
1351 printk(KERN_INFO "NFS: bad IP address specified: %s\n", p);
1352 return 0;
1435out_invalid_value: 1353out_invalid_value:
1436 printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); 1354 printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p);
1437 return 0; 1355 return 0;
1438out_nomem: 1356out_nomem:
1439 printk(KERN_INFO "NFS: not enough memory to parse option\n"); 1357 printk(KERN_INFO "NFS: not enough memory to parse option\n");
@@ -1445,13 +1363,60 @@ out_security_failure:
1445} 1363}
1446 1364
1447/* 1365/*
1366 * Match the requested auth flavors with the list returned by
1367 * the server. Returns zero and sets the mount's authentication
1368 * flavor on success; returns -EACCES if server does not support
1369 * the requested flavor.
1370 */
1371static int nfs_walk_authlist(struct nfs_parsed_mount_data *args,
1372 struct nfs_mount_request *request)
1373{
1374 unsigned int i, j, server_authlist_len = *(request->auth_flav_len);
1375
1376 /*
1377 * Certain releases of Linux's mountd return an empty
1378 * flavor list. To prevent behavioral regression with
1379 * these servers (ie. rejecting mounts that used to
1380 * succeed), revert to pre-2.6.32 behavior (no checking)
1381 * if the returned flavor list is empty.
1382 */
1383 if (server_authlist_len == 0)
1384 return 0;
1385
1386 /*
1387 * We avoid sophisticated negotiating here, as there are
1388 * plenty of cases where we can get it wrong, providing
1389 * either too little or too much security.
1390 *
1391 * RFC 2623, section 2.7 suggests we SHOULD prefer the
1392 * flavor listed first. However, some servers list
1393 * AUTH_NULL first. Our caller plants AUTH_SYS, the
1394 * preferred default, in args->auth_flavors[0] if user
1395 * didn't specify sec= mount option.
1396 */
1397 for (i = 0; i < args->auth_flavor_len; i++)
1398 for (j = 0; j < server_authlist_len; j++)
1399 if (args->auth_flavors[i] == request->auth_flavs[j]) {
1400 dfprintk(MOUNT, "NFS: using auth flavor %d\n",
1401 request->auth_flavs[j]);
1402 args->auth_flavors[0] = request->auth_flavs[j];
1403 return 0;
1404 }
1405
1406 dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n");
1407 nfs_umount(request);
1408 return -EACCES;
1409}
1410
1411/*
1448 * Use the remote server's MOUNT service to request the NFS file handle 1412 * Use the remote server's MOUNT service to request the NFS file handle
1449 * corresponding to the provided path. 1413 * corresponding to the provided path.
1450 */ 1414 */
1451static int nfs_try_mount(struct nfs_parsed_mount_data *args, 1415static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1452 struct nfs_fh *root_fh) 1416 struct nfs_fh *root_fh)
1453{ 1417{
1454 unsigned int auth_flavor_len = 0; 1418 rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS];
1419 unsigned int server_authlist_len = ARRAY_SIZE(server_authlist);
1455 struct nfs_mount_request request = { 1420 struct nfs_mount_request request = {
1456 .sap = (struct sockaddr *) 1421 .sap = (struct sockaddr *)
1457 &args->mount_server.address, 1422 &args->mount_server.address,
@@ -1459,7 +1424,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1459 .protocol = args->mount_server.protocol, 1424 .protocol = args->mount_server.protocol,
1460 .fh = root_fh, 1425 .fh = root_fh,
1461 .noresvport = args->flags & NFS_MOUNT_NORESVPORT, 1426 .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
1462 .auth_flav_len = &auth_flavor_len, 1427 .auth_flav_len = &server_authlist_len,
1428 .auth_flavs = server_authlist,
1463 }; 1429 };
1464 int status; 1430 int status;
1465 1431
@@ -1485,23 +1451,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1485 args->mount_server.addrlen = args->nfs_server.addrlen; 1451 args->mount_server.addrlen = args->nfs_server.addrlen;
1486 } 1452 }
1487 request.salen = args->mount_server.addrlen; 1453 request.salen = args->mount_server.addrlen;
1488 1454 nfs_set_default_port(request.sap, args->mount_server.port, 0);
1489 /*
1490 * autobind will be used if mount_server.port == 0
1491 */
1492 nfs_set_port(request.sap, args->mount_server.port);
1493 1455
1494 /* 1456 /*
1495 * Now ask the mount server to map our export path 1457 * Now ask the mount server to map our export path
1496 * to a file handle. 1458 * to a file handle.
1497 */ 1459 */
1498 status = nfs_mount(&request); 1460 status = nfs_mount(&request);
1499 if (status == 0) 1461 if (status != 0) {
1500 return 0; 1462 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
1463 request.hostname, status);
1464 return status;
1465 }
1501 1466
1502 dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", 1467 /*
1503 request.hostname, status); 1468 * MNTv1 (NFSv2) does not support auth flavor negotiation.
1504 return status; 1469 */
1470 if (args->mount_server.version != NFS_MNT3_VERSION)
1471 return 0;
1472 return nfs_walk_authlist(args, &request);
1505} 1473}
1506 1474
1507static int nfs_parse_simple_hostname(const char *dev_name, 1475static int nfs_parse_simple_hostname(const char *dev_name,
@@ -1661,6 +1629,7 @@ static int nfs_validate_mount_data(void *options,
1661 const char *dev_name) 1629 const char *dev_name)
1662{ 1630{
1663 struct nfs_mount_data *data = (struct nfs_mount_data *)options; 1631 struct nfs_mount_data *data = (struct nfs_mount_data *)options;
1632 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
1664 1633
1665 if (data == NULL) 1634 if (data == NULL)
1666 goto out_no_data; 1635 goto out_no_data;
@@ -1672,10 +1641,12 @@ static int nfs_validate_mount_data(void *options,
1672 args->acregmax = NFS_DEF_ACREGMAX; 1641 args->acregmax = NFS_DEF_ACREGMAX;
1673 args->acdirmin = NFS_DEF_ACDIRMIN; 1642 args->acdirmin = NFS_DEF_ACDIRMIN;
1674 args->acdirmax = NFS_DEF_ACDIRMAX; 1643 args->acdirmax = NFS_DEF_ACDIRMAX;
1675 args->mount_server.port = 0; /* autobind unless user sets port */ 1644 args->mount_server.port = NFS_UNSPEC_PORT;
1676 args->nfs_server.port = 0; /* autobind unless user sets port */ 1645 args->nfs_server.port = NFS_UNSPEC_PORT;
1677 args->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1646 args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1678 args->auth_flavors[0] = RPC_AUTH_UNIX; 1647 args->auth_flavors[0] = RPC_AUTH_UNIX;
1648 args->auth_flavor_len = 1;
1649 args->minorversion = 0;
1679 1650
1680 switch (data->version) { 1651 switch (data->version) {
1681 case 1: 1652 case 1:
@@ -1697,8 +1668,11 @@ static int nfs_validate_mount_data(void *options,
1697 if (data->root.size > NFS3_FHSIZE || data->root.size == 0) 1668 if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
1698 goto out_invalid_fh; 1669 goto out_invalid_fh;
1699 mntfh->size = data->root.size; 1670 mntfh->size = data->root.size;
1700 } else 1671 args->version = 3;
1672 } else {
1701 mntfh->size = NFS2_FHSIZE; 1673 mntfh->size = NFS2_FHSIZE;
1674 args->version = 2;
1675 }
1702 1676
1703 1677
1704 memcpy(mntfh->data, data->root.data, mntfh->size); 1678 memcpy(mntfh->data, data->root.data, mntfh->size);
@@ -1720,11 +1694,9 @@ static int nfs_validate_mount_data(void *options,
1720 args->acdirmin = data->acdirmin; 1694 args->acdirmin = data->acdirmin;
1721 args->acdirmax = data->acdirmax; 1695 args->acdirmax = data->acdirmax;
1722 1696
1723 memcpy(&args->nfs_server.address, &data->addr, 1697 memcpy(sap, &data->addr, sizeof(data->addr));
1724 sizeof(data->addr));
1725 args->nfs_server.addrlen = sizeof(data->addr); 1698 args->nfs_server.addrlen = sizeof(data->addr);
1726 if (!nfs_verify_server_address((struct sockaddr *) 1699 if (!nfs_verify_server_address(sap))
1727 &args->nfs_server.address))
1728 goto out_no_address; 1700 goto out_no_address;
1729 1701
1730 if (!(data->flags & NFS_MOUNT_TCP)) 1702 if (!(data->flags & NFS_MOUNT_TCP))
@@ -1772,12 +1744,18 @@ static int nfs_validate_mount_data(void *options,
1772 if (nfs_parse_mount_options((char *)options, args) == 0) 1744 if (nfs_parse_mount_options((char *)options, args) == 0)
1773 return -EINVAL; 1745 return -EINVAL;
1774 1746
1775 if (!nfs_verify_server_address((struct sockaddr *) 1747 if (!nfs_verify_server_address(sap))
1776 &args->nfs_server.address))
1777 goto out_no_address; 1748 goto out_no_address;
1778 1749
1779 nfs_set_port((struct sockaddr *)&args->nfs_server.address, 1750 if (args->version == 4)
1780 args->nfs_server.port); 1751#ifdef CONFIG_NFS_V4
1752 return nfs4_validate_text_mount_data(options,
1753 args, dev_name);
1754#else
1755 goto out_v4_not_compiled;
1756#endif
1757
1758 nfs_set_default_port(sap, args->nfs_server.port, 0);
1781 1759
1782 nfs_set_mount_transport_protocol(args); 1760 nfs_set_mount_transport_protocol(args);
1783 1761
@@ -1825,6 +1803,12 @@ out_v3_not_compiled:
1825 return -EPROTONOSUPPORT; 1803 return -EPROTONOSUPPORT;
1826#endif /* !CONFIG_NFS_V3 */ 1804#endif /* !CONFIG_NFS_V3 */
1827 1805
1806#ifndef CONFIG_NFS_V4
1807out_v4_not_compiled:
1808 dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n");
1809 return -EPROTONOSUPPORT;
1810#endif /* !CONFIG_NFS_V4 */
1811
1828out_nomem: 1812out_nomem:
1829 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); 1813 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
1830 return -ENOMEM; 1814 return -ENOMEM;
@@ -1934,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb)
1934 if (server->flags & NFS_MOUNT_NOAC) 1918 if (server->flags & NFS_MOUNT_NOAC)
1935 sb->s_flags |= MS_SYNCHRONOUS; 1919 sb->s_flags |= MS_SYNCHRONOUS;
1936 1920
1921 sb->s_bdi = &server->backing_dev_info;
1922
1937 nfs_super_set_maxbytes(sb, server->maxfilesize); 1923 nfs_super_set_maxbytes(sb, server->maxfilesize);
1938} 1924}
1939 1925
@@ -2120,6 +2106,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2120 if (error < 0) 2106 if (error < 0)
2121 goto out; 2107 goto out;
2122 2108
2109#ifdef CONFIG_NFS_V4
2110 if (data->version == 4) {
2111 error = nfs4_try_mount(flags, dev_name, data, mnt);
2112 kfree(data->client_address);
2113 goto out;
2114 }
2115#endif /* CONFIG_NFS_V4 */
2116
2123 /* Get a volume representation */ 2117 /* Get a volume representation */
2124 server = nfs_create_server(data, mntfh); 2118 server = nfs_create_server(data, mntfh);
2125 if (IS_ERR(server)) { 2119 if (IS_ERR(server)) {
@@ -2317,6 +2311,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args)
2317 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); 2311 args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3);
2318} 2312}
2319 2313
2314static int nfs4_validate_text_mount_data(void *options,
2315 struct nfs_parsed_mount_data *args,
2316 const char *dev_name)
2317{
2318 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2319
2320 nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
2321
2322 nfs_validate_transport_protocol(args);
2323
2324 nfs4_validate_mount_flags(args);
2325
2326 if (args->version != 4) {
2327 dfprintk(MOUNT,
2328 "NFS4: Illegal mount version\n");
2329 return -EINVAL;
2330 }
2331
2332 if (args->auth_flavor_len > 1) {
2333 dfprintk(MOUNT,
2334 "NFS4: Too many RPC auth flavours specified\n");
2335 return -EINVAL;
2336 }
2337
2338 if (args->client_address == NULL) {
2339 dfprintk(MOUNT,
2340 "NFS4: mount program didn't pass callback address\n");
2341 return -EINVAL;
2342 }
2343
2344 return nfs_parse_devname(dev_name,
2345 &args->nfs_server.hostname,
2346 NFS4_MAXNAMLEN,
2347 &args->nfs_server.export_path,
2348 NFS4_MAXPATHLEN);
2349}
2350
2320/* 2351/*
2321 * Validate NFSv4 mount options 2352 * Validate NFSv4 mount options
2322 */ 2353 */
@@ -2324,7 +2355,7 @@ static int nfs4_validate_mount_data(void *options,
2324 struct nfs_parsed_mount_data *args, 2355 struct nfs_parsed_mount_data *args,
2325 const char *dev_name) 2356 const char *dev_name)
2326{ 2357{
2327 struct sockaddr_in *ap; 2358 struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address;
2328 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; 2359 struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
2329 char *c; 2360 char *c;
2330 2361
@@ -2337,23 +2368,22 @@ static int nfs4_validate_mount_data(void *options,
2337 args->acregmax = NFS_DEF_ACREGMAX; 2368 args->acregmax = NFS_DEF_ACREGMAX;
2338 args->acdirmin = NFS_DEF_ACDIRMIN; 2369 args->acdirmin = NFS_DEF_ACDIRMIN;
2339 args->acdirmax = NFS_DEF_ACDIRMAX; 2370 args->acdirmax = NFS_DEF_ACDIRMAX;
2340 args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ 2371 args->nfs_server.port = NFS_UNSPEC_PORT;
2341 args->auth_flavors[0] = RPC_AUTH_UNIX; 2372 args->auth_flavors[0] = RPC_AUTH_UNIX;
2342 args->auth_flavor_len = 0; 2373 args->auth_flavor_len = 1;
2374 args->version = 4;
2343 args->minorversion = 0; 2375 args->minorversion = 0;
2344 2376
2345 switch (data->version) { 2377 switch (data->version) {
2346 case 1: 2378 case 1:
2347 ap = (struct sockaddr_in *)&args->nfs_server.address;
2348 if (data->host_addrlen > sizeof(args->nfs_server.address)) 2379 if (data->host_addrlen > sizeof(args->nfs_server.address))
2349 goto out_no_address; 2380 goto out_no_address;
2350 if (data->host_addrlen == 0) 2381 if (data->host_addrlen == 0)
2351 goto out_no_address; 2382 goto out_no_address;
2352 args->nfs_server.addrlen = data->host_addrlen; 2383 args->nfs_server.addrlen = data->host_addrlen;
2353 if (copy_from_user(ap, data->host_addr, data->host_addrlen)) 2384 if (copy_from_user(sap, data->host_addr, data->host_addrlen))
2354 return -EFAULT; 2385 return -EFAULT;
2355 if (!nfs_verify_server_address((struct sockaddr *) 2386 if (!nfs_verify_server_address(sap))
2356 &args->nfs_server.address))
2357 goto out_no_address; 2387 goto out_no_address;
2358 2388
2359 if (data->auth_flavourlen) { 2389 if (data->auth_flavourlen) {
@@ -2399,39 +2429,14 @@ static int nfs4_validate_mount_data(void *options,
2399 nfs_validate_transport_protocol(args); 2429 nfs_validate_transport_protocol(args);
2400 2430
2401 break; 2431 break;
2402 default: { 2432 default:
2403 int status;
2404
2405 if (nfs_parse_mount_options((char *)options, args) == 0) 2433 if (nfs_parse_mount_options((char *)options, args) == 0)
2406 return -EINVAL; 2434 return -EINVAL;
2407 2435
2408 if (!nfs_verify_server_address((struct sockaddr *) 2436 if (!nfs_verify_server_address(sap))
2409 &args->nfs_server.address))
2410 return -EINVAL; 2437 return -EINVAL;
2411 2438
2412 nfs_set_port((struct sockaddr *)&args->nfs_server.address, 2439 return nfs4_validate_text_mount_data(options, args, dev_name);
2413 args->nfs_server.port);
2414
2415 nfs_validate_transport_protocol(args);
2416
2417 nfs4_validate_mount_flags(args);
2418
2419 if (args->auth_flavor_len > 1)
2420 goto out_inval_auth;
2421
2422 if (args->client_address == NULL)
2423 goto out_no_client_address;
2424
2425 status = nfs_parse_devname(dev_name,
2426 &args->nfs_server.hostname,
2427 NFS4_MAXNAMLEN,
2428 &args->nfs_server.export_path,
2429 NFS4_MAXPATHLEN);
2430 if (status < 0)
2431 return status;
2432
2433 break;
2434 }
2435 } 2440 }
2436 2441
2437 return 0; 2442 return 0;
@@ -2448,10 +2453,6 @@ out_inval_auth:
2448out_no_address: 2453out_no_address:
2449 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); 2454 dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
2450 return -EINVAL; 2455 return -EINVAL;
2451
2452out_no_client_address:
2453 dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n");
2454 return -EINVAL;
2455} 2456}
2456 2457
2457/* 2458/*
@@ -2618,6 +2619,34 @@ out_err:
2618 return ret; 2619 return ret;
2619} 2620}
2620 2621
2622static int nfs4_try_mount(int flags, const char *dev_name,
2623 struct nfs_parsed_mount_data *data,
2624 struct vfsmount *mnt)
2625{
2626 char *export_path;
2627 struct vfsmount *root_mnt;
2628 int error;
2629
2630 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2631
2632 export_path = data->nfs_server.export_path;
2633 data->nfs_server.export_path = "/";
2634 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2635 data->nfs_server.hostname);
2636 data->nfs_server.export_path = export_path;
2637
2638 error = PTR_ERR(root_mnt);
2639 if (IS_ERR(root_mnt))
2640 goto out;
2641
2642 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2643
2644out:
2645 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error,
2646 error != 0 ? " [error]" : "");
2647 return error;
2648}
2649
2621/* 2650/*
2622 * Get the superblock for an NFS4 mountpoint 2651 * Get the superblock for an NFS4 mountpoint
2623 */ 2652 */
@@ -2625,8 +2654,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2625 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2654 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
2626{ 2655{
2627 struct nfs_parsed_mount_data *data; 2656 struct nfs_parsed_mount_data *data;
2628 char *export_path;
2629 struct vfsmount *root_mnt;
2630 int error = -ENOMEM; 2657 int error = -ENOMEM;
2631 2658
2632 data = kzalloc(sizeof(*data), GFP_KERNEL); 2659 data = kzalloc(sizeof(*data), GFP_KERNEL);
@@ -2638,17 +2665,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2638 if (error < 0) 2665 if (error < 0)
2639 goto out; 2666 goto out;
2640 2667
2641 export_path = data->nfs_server.export_path; 2668 error = nfs4_try_mount(flags, dev_name, data, mnt);
2642 data->nfs_server.export_path = "/";
2643 root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data,
2644 data->nfs_server.hostname);
2645 data->nfs_server.export_path = export_path;
2646
2647 error = PTR_ERR(root_mnt);
2648 if (IS_ERR(root_mnt))
2649 goto out;
2650
2651 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2652 2669
2653out: 2670out:
2654 kfree(data->client_address); 2671 kfree(data->client_address);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index a34fae21fe10..53eb26c16b50 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/migrate.h>
16 17
17#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
18#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
@@ -26,6 +27,7 @@
26#include "internal.h" 27#include "internal.h"
27#include "iostat.h" 28#include "iostat.h"
28#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h"
29 31
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 32#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 33
@@ -218,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page)
218 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 220 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
219} 221}
220 222
221/* 223static struct nfs_page *nfs_find_and_lock_request(struct page *page)
222 * Find an associated nfs write request, and prepare to flush it out
223 * May return an error if the user signalled nfs_wait_on_request().
224 */
225static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
226 struct page *page)
227{ 224{
228 struct inode *inode = page->mapping->host; 225 struct inode *inode = page->mapping->host;
229 struct nfs_page *req; 226 struct nfs_page *req;
230 int ret; 227 int ret;
231 228
232 spin_lock(&inode->i_lock); 229 spin_lock(&inode->i_lock);
233 for(;;) { 230 for (;;) {
234 req = nfs_page_find_request_locked(page); 231 req = nfs_page_find_request_locked(page);
235 if (req == NULL) { 232 if (req == NULL)
236 spin_unlock(&inode->i_lock); 233 break;
237 return 0;
238 }
239 if (nfs_set_page_tag_locked(req)) 234 if (nfs_set_page_tag_locked(req))
240 break; 235 break;
241 /* Note: If we hold the page lock, as is the case in nfs_writepage, 236 /* Note: If we hold the page lock, as is the case in nfs_writepage,
@@ -247,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
247 ret = nfs_wait_on_request(req); 242 ret = nfs_wait_on_request(req);
248 nfs_release_request(req); 243 nfs_release_request(req);
249 if (ret != 0) 244 if (ret != 0)
250 return ret; 245 return ERR_PTR(ret);
251 spin_lock(&inode->i_lock); 246 spin_lock(&inode->i_lock);
252 } 247 }
253 if (test_bit(PG_CLEAN, &req->wb_flags)) {
254 spin_unlock(&inode->i_lock);
255 BUG();
256 }
257 if (nfs_set_page_writeback(page) != 0) {
258 spin_unlock(&inode->i_lock);
259 BUG();
260 }
261 spin_unlock(&inode->i_lock); 248 spin_unlock(&inode->i_lock);
249 return req;
250}
251
252/*
253 * Find an associated nfs write request, and prepare to flush it out
254 * May return an error if the user signalled nfs_wait_on_request().
255 */
256static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
257 struct page *page)
258{
259 struct nfs_page *req;
260 int ret = 0;
261
262 req = nfs_find_and_lock_request(page);
263 if (!req)
264 goto out;
265 ret = PTR_ERR(req);
266 if (IS_ERR(req))
267 goto out;
268
269 ret = nfs_set_page_writeback(page);
270 BUG_ON(ret != 0);
271 BUG_ON(test_bit(PG_CLEAN, &req->wb_flags));
272
262 if (!nfs_pageio_add_request(pgio, req)) { 273 if (!nfs_pageio_add_request(pgio, req)) {
263 nfs_redirty_request(req); 274 nfs_redirty_request(req);
264 return pgio->pg_error; 275 ret = pgio->pg_error;
265 } 276 }
266 return 0; 277out:
278 return ret;
267} 279}
268 280
269static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) 281static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
@@ -1478,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
1478 .nr_to_write = LONG_MAX, 1490 .nr_to_write = LONG_MAX,
1479 .range_start = 0, 1491 .range_start = 0,
1480 .range_end = LLONG_MAX, 1492 .range_end = LLONG_MAX,
1481 .for_writepages = 1,
1482 }; 1493 };
1483 1494
1484 return __nfs_write_mapping(mapping, &wbc, how); 1495 return __nfs_write_mapping(mapping, &wbc, how);
@@ -1580,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page)
1580 return nfs_wb_page_priority(inode, page, FLUSH_STABLE); 1591 return nfs_wb_page_priority(inode, page, FLUSH_STABLE);
1581} 1592}
1582 1593
1594#ifdef CONFIG_MIGRATION
1595int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1596 struct page *page)
1597{
1598 struct nfs_page *req;
1599 int ret;
1600
1601 if (PageFsCache(page))
1602 nfs_fscache_release_page(page, GFP_KERNEL);
1603
1604 req = nfs_find_and_lock_request(page);
1605 ret = PTR_ERR(req);
1606 if (IS_ERR(req))
1607 goto out;
1608
1609 ret = migrate_page(mapping, newpage, page);
1610 if (!req)
1611 goto out;
1612 if (ret)
1613 goto out_unlock;
1614 page_cache_get(newpage);
1615 req->wb_page = newpage;
1616 SetPagePrivate(newpage);
1617 set_page_private(newpage, page_private(page));
1618 ClearPagePrivate(page);
1619 set_page_private(page, 0);
1620 page_cache_release(page);
1621out_unlock:
1622 nfs_clear_page_tag_locked(req);
1623 nfs_release_request(req);
1624out:
1625 return ret;
1626}
1627#endif
1628
1583int __init nfs_init_writepagecache(void) 1629int __init nfs_init_writepagecache(void)
1584{ 1630{
1585 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1631 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 5573508f707f..36fcabbf5186 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
34 int flags = nfsexp_flags(rqstp, exp); 34 int flags = nfsexp_flags(rqstp, exp);
35 int ret; 35 int ret;
36 36
37 validate_process_creds();
38
37 /* discard any old override before preparing the new set */ 39 /* discard any old override before preparing the new set */
38 revert_creds(get_cred(current->real_cred)); 40 revert_creds(get_cred(current->real_cred));
39 new = prepare_creds(); 41 new = prepare_creds();
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
86 else 88 else
87 new->cap_effective = cap_raise_nfsd_set(new->cap_effective, 89 new->cap_effective = cap_raise_nfsd_set(new->cap_effective,
88 new->cap_permitted); 90 new->cap_permitted);
91 validate_process_creds();
89 put_cred(override_creds(new)); 92 put_cred(override_creds(new));
90 put_cred(new); 93 put_cred(new);
94 validate_process_creds();
91 return 0; 95 return 0;
92 96
93oom: 97oom:
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b92a27629fb7..d9462643155c 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd,
85 (*bpp)[-1] = '\n'; 85 (*bpp)[-1] = '\n';
86} 86}
87 87
88static int expkey_upcall(struct cache_detail *cd, struct cache_head *h)
89{
90 return sunrpc_cache_pipe_upcall(cd, h, expkey_request);
91}
92
88static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); 93static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old);
89static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); 94static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *);
90static struct cache_detail svc_expkey_cache; 95static struct cache_detail svc_expkey_cache;
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = {
259 .hash_table = expkey_table, 264 .hash_table = expkey_table,
260 .name = "nfsd.fh", 265 .name = "nfsd.fh",
261 .cache_put = expkey_put, 266 .cache_put = expkey_put,
262 .cache_request = expkey_request, 267 .cache_upcall = expkey_upcall,
263 .cache_parse = expkey_parse, 268 .cache_parse = expkey_parse,
264 .cache_show = expkey_show, 269 .cache_show = expkey_show,
265 .match = expkey_match, 270 .match = expkey_match,
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd,
355 (*bpp)[-1] = '\n'; 360 (*bpp)[-1] = '\n';
356} 361}
357 362
363static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h)
364{
365 return sunrpc_cache_pipe_upcall(cd, h, svc_export_request);
366}
367
358static struct svc_export *svc_export_update(struct svc_export *new, 368static struct svc_export *svc_export_update(struct svc_export *new,
359 struct svc_export *old); 369 struct svc_export *old);
360static struct svc_export *svc_export_lookup(struct svc_export *); 370static struct svc_export *svc_export_lookup(struct svc_export *);
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = {
724 .hash_table = export_table, 734 .hash_table = export_table,
725 .name = "nfsd.export", 735 .name = "nfsd.export",
726 .cache_put = svc_export_put, 736 .cache_put = svc_export_put,
727 .cache_request = svc_export_request, 737 .cache_upcall = svc_export_upcall,
728 .cache_parse = svc_export_parse, 738 .cache_parse = svc_export_parse,
729 .cache_show = svc_export_show, 739 .cache_show = svc_export_show,
730 .match = svc_export_match, 740 .match = svc_export_match,
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5b398421b051..cdfa86fa1471 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -146,6 +146,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
146} 146}
147 147
148static int 148static int
149idtoname_upcall(struct cache_detail *cd, struct cache_head *ch)
150{
151 return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request);
152}
153
154static int
149idtoname_match(struct cache_head *ca, struct cache_head *cb) 155idtoname_match(struct cache_head *ca, struct cache_head *cb)
150{ 156{
151 struct ent *a = container_of(ca, struct ent, h); 157 struct ent *a = container_of(ca, struct ent, h);
@@ -175,10 +181,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
175} 181}
176 182
177static void 183static void
178warn_no_idmapd(struct cache_detail *detail) 184warn_no_idmapd(struct cache_detail *detail, int has_died)
179{ 185{
180 printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", 186 printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n",
181 detail->last_close? "died" : "not been started"); 187 has_died ? "died" : "not been started");
182} 188}
183 189
184 190
@@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = {
192 .hash_table = idtoname_table, 198 .hash_table = idtoname_table,
193 .name = "nfs4.idtoname", 199 .name = "nfs4.idtoname",
194 .cache_put = ent_put, 200 .cache_put = ent_put,
195 .cache_request = idtoname_request, 201 .cache_upcall = idtoname_upcall,
196 .cache_parse = idtoname_parse, 202 .cache_parse = idtoname_parse,
197 .cache_show = idtoname_show, 203 .cache_show = idtoname_show,
198 .warn_no_listener = warn_no_idmapd, 204 .warn_no_listener = warn_no_idmapd,
@@ -325,6 +331,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
325} 331}
326 332
327static int 333static int
334nametoid_upcall(struct cache_detail *cd, struct cache_head *ch)
335{
336 return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request);
337}
338
339static int
328nametoid_match(struct cache_head *ca, struct cache_head *cb) 340nametoid_match(struct cache_head *ca, struct cache_head *cb)
329{ 341{
330 struct ent *a = container_of(ca, struct ent, h); 342 struct ent *a = container_of(ca, struct ent, h);
@@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = {
363 .hash_table = nametoid_table, 375 .hash_table = nametoid_table,
364 .name = "nfs4.nametoid", 376 .name = "nfs4.nametoid",
365 .cache_put = ent_put, 377 .cache_put = ent_put,
366 .cache_request = nametoid_request, 378 .cache_upcall = nametoid_upcall,
367 .cache_parse = nametoid_parse, 379 .cache_parse = nametoid_parse,
368 .cache_show = nametoid_show, 380 .cache_show = nametoid_show,
369 .warn_no_listener = warn_no_idmapd, 381 .warn_no_listener = warn_no_idmapd,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 6d0847562d87..7e906c5b7671 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -37,6 +37,7 @@
37#include <linux/nfsd/xdr.h> 37#include <linux/nfsd/xdr.h>
38#include <linux/nfsd/syscall.h> 38#include <linux/nfsd/syscall.h>
39#include <linux/lockd/lockd.h> 39#include <linux/lockd/lockd.h>
40#include <linux/sunrpc/clnt.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <net/ipv6.h> 43#include <net/ipv6.h>
@@ -490,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
490 * 491 *
491 * Input: 492 * Input:
492 * buf: '\n'-terminated C string containing a 493 * buf: '\n'-terminated C string containing a
493 * presentation format IPv4 address 494 * presentation format IP address
494 * size: length of C string in @buf 495 * size: length of C string in @buf
495 * Output: 496 * Output:
496 * On success: returns zero if all specified locks were released; 497 * On success: returns zero if all specified locks were released;
497 * returns one if one or more locks were not released 498 * returns one if one or more locks were not released
498 * On error: return code is negative errno value 499 * On error: return code is negative errno value
499 *
500 * Note: Only AF_INET client addresses are passed in
501 */ 500 */
502static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) 501static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
503{ 502{
504 struct sockaddr_in sin = { 503 struct sockaddr_storage address;
505 .sin_family = AF_INET, 504 struct sockaddr *sap = (struct sockaddr *)&address;
506 }; 505 size_t salen = sizeof(address);
507 int b1, b2, b3, b4;
508 char c;
509 char *fo_path; 506 char *fo_path;
510 507
511 /* sanity check */ 508 /* sanity check */
@@ -519,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size)
519 if (qword_get(&buf, fo_path, size) < 0) 516 if (qword_get(&buf, fo_path, size) < 0)
520 return -EINVAL; 517 return -EINVAL;
521 518
522 /* get ipv4 address */ 519 if (rpc_pton(fo_path, size, sap, salen) == 0)
523 if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
524 return -EINVAL;
525 if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
526 return -EINVAL; 520 return -EINVAL;
527 sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
528 521
529 return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); 522 return nlmsvc_unlock_all_by_ip(sap);
530} 523}
531 524
532/** 525/**
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 492c79b7800b..24d58adfe5fd 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -496,7 +496,9 @@ nfsd(void *vrqstp)
496 /* Lock the export hash tables for reading. */ 496 /* Lock the export hash tables for reading. */
497 exp_readlock(); 497 exp_readlock();
498 498
499 validate_process_creds();
499 svc_process(rqstp); 500 svc_process(rqstp);
501 validate_process_creds();
500 502
501 /* Unlock export hash tables */ 503 /* Unlock export hash tables */
502 exp_readunlock(); 504 exp_readunlock();
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23341c1063bc..8fa09bfbcba7 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
684 __be32 err; 684 __be32 err;
685 int host_err; 685 int host_err;
686 686
687 validate_process_creds();
688
687 /* 689 /*
688 * If we get here, then the client has already done an "open", 690 * If we get here, then the client has already done an "open",
689 * and (hopefully) checked permission - so allow OWNER_OVERRIDE 691 * and (hopefully) checked permission - so allow OWNER_OVERRIDE
@@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
740out_nfserr: 742out_nfserr:
741 err = nfserrno(host_err); 743 err = nfserrno(host_err);
742out: 744out:
745 validate_process_creds();
743 return err; 746 return err;
744} 747}
745 748
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 72da095d4009..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -1,6 +1,6 @@
1config NILFS2_FS 1config NILFS2_FS
2 tristate "NILFS2 file system support (EXPERIMENTAL)" 2 tristate "NILFS2 file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on EXPERIMENTAL
4 select CRC32 4 select CRC32
5 help 5 help
6 NILFS2 is a log-structured file system (LFS) supporting continuous 6 NILFS2 is a log-structured file system (LFS) supporting continuous
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 99d58a028b94..08834df6ec68 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
36 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); 36 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
37} 37}
38 38
39/**
40 * nilfs_bmap_lookup_at_level - find a data block or node block
41 * @bmap: bmap
42 * @key: key
43 * @level: level
44 * @ptrp: place to store the value associated to @key
45 *
46 * Description: nilfs_bmap_lookup_at_level() finds a record whose key
47 * matches @key in the block at @level of the bmap.
48 *
49 * Return Value: On success, 0 is returned and the record associated with @key
50 * is stored in the place pointed by @ptrp. On error, one of the following
51 * negative error codes is returned.
52 *
53 * %-EIO - I/O error.
54 *
55 * %-ENOMEM - Insufficient amount of memory available.
56 *
57 * %-ENOENT - A record associated with @key does not exist.
58 */
39int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, 59int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
40 __u64 *ptrp) 60 __u64 *ptrp)
41{ 61{
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
69 return ret; 89 return ret;
70} 90}
71 91
72/**
73 * nilfs_bmap_lookup - find a record
74 * @bmap: bmap
75 * @key: key
76 * @recp: pointer to record
77 *
78 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
79 * @bmap.
80 *
81 * Return Value: On success, 0 is returned and the record associated with @key
82 * is stored in the place pointed by @recp. On error, one of the following
83 * negative error codes is returned.
84 *
85 * %-EIO - I/O error.
86 *
87 * %-ENOMEM - Insufficient amount of memory available.
88 *
89 * %-ENOENT - A record associated with @key does not exist.
90 */
91int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
92 unsigned long key,
93 unsigned long *recp)
94{
95 __u64 ptr;
96 int ret;
97
98 /* XXX: use macro for level 1 */
99 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
100 if (recp != NULL)
101 *recp = ptr;
102 return ret;
103}
104
105static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 92static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
106{ 93{
107 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; 94 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
@@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
469 (entries_per_group / NILFS_BMAP_GROUP_DIV); 456 (entries_per_group / NILFS_BMAP_GROUP_DIV);
470} 457}
471 458
472int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
473 union nilfs_bmap_ptr_req *req)
474{
475 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
476}
477
478void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
479 union nilfs_bmap_ptr_req *req)
480{
481 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
482}
483
484void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
485 union nilfs_bmap_ptr_req *req)
486{
487 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
488}
489
490int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req,
491 sector_t blocknr)
492{
493 struct inode *dat = nilfs_bmap_get_dat(bmap);
494 int ret;
495
496 ret = nilfs_dat_prepare_start(dat, &req->bpr_req);
497 if (likely(!ret))
498 nilfs_dat_commit_start(dat, &req->bpr_req, blocknr);
499 return ret;
500}
501
502int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req,
512 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
513}
514
515void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
516 union nilfs_bmap_ptr_req *req)
517{
518 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
519}
520
521int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
522 sector_t blocknr)
523{
524 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
525}
526
527int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
528{
529 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
530}
531
532int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap,
533 union nilfs_bmap_ptr_req *oldreq,
534 union nilfs_bmap_ptr_req *newreq)
535{
536 struct inode *dat = nilfs_bmap_get_dat(bmap);
537 int ret;
538
539 ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req);
540 if (ret < 0)
541 return ret;
542 ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req);
543 if (ret < 0)
544 nilfs_dat_abort_end(dat, &oldreq->bpr_req);
545
546 return ret;
547}
548
549void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap,
550 union nilfs_bmap_ptr_req *oldreq,
551 union nilfs_bmap_ptr_req *newreq)
552{
553 struct inode *dat = nilfs_bmap_get_dat(bmap);
554
555 nilfs_dat_commit_end(dat, &oldreq->bpr_req,
556 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
557 nilfs_dat_commit_alloc(dat, &newreq->bpr_req);
558}
559
560void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap,
561 union nilfs_bmap_ptr_req *oldreq,
562 union nilfs_bmap_ptr_req *newreq)
563{
564 struct inode *dat = nilfs_bmap_get_dat(bmap);
565
566 nilfs_dat_abort_end(dat, &oldreq->bpr_req);
567 nilfs_dat_abort_alloc(dat, &newreq->bpr_req);
568}
569
570static struct lock_class_key nilfs_bmap_dat_lock_key; 459static struct lock_class_key nilfs_bmap_dat_lock_key;
571static struct lock_class_key nilfs_bmap_mdt_lock_key; 460static struct lock_class_key nilfs_bmap_mdt_lock_key;
572 461
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b2890cdcef12..9980d7dbab91 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -28,6 +28,7 @@
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "alloc.h" 30#include "alloc.h"
31#include "dat.h"
31 32
32#define NILFS_BMAP_INVALID_PTR 0 33#define NILFS_BMAP_INVALID_PTR 0
33 34
@@ -141,7 +142,6 @@ struct nilfs_bmap {
141int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); 142int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
142int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); 143int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
143void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); 144void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
144int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
145int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); 145int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
146int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); 146int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
147int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); 147int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
@@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
160void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); 160void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
161 161
162 162
163static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key,
164 __u64 *ptr)
165{
166 return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr);
167}
168
163/* 169/*
164 * Internal use only 170 * Internal use only
165 */ 171 */
166struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); 172struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *);
167int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *,
168 union nilfs_bmap_ptr_req *);
169void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *,
170 union nilfs_bmap_ptr_req *);
171void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *,
172 union nilfs_bmap_ptr_req *);
173 173
174static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, 174static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap,
175 union nilfs_bmap_ptr_req *req) 175 union nilfs_bmap_ptr_req *req,
176 struct inode *dat)
176{ 177{
177 if (NILFS_BMAP_USE_VBN(bmap)) 178 if (dat)
178 return nilfs_bmap_prepare_alloc_v(bmap, req); 179 return nilfs_dat_prepare_alloc(dat, &req->bpr_req);
179 /* ignore target ptr */ 180 /* ignore target ptr */
180 req->bpr_ptr = bmap->b_last_allocated_ptr++; 181 req->bpr_ptr = bmap->b_last_allocated_ptr++;
181 return 0; 182 return 0;
182} 183}
183 184
184static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, 185static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap,
185 union nilfs_bmap_ptr_req *req) 186 union nilfs_bmap_ptr_req *req,
187 struct inode *dat)
186{ 188{
187 if (NILFS_BMAP_USE_VBN(bmap)) 189 if (dat)
188 nilfs_bmap_commit_alloc_v(bmap, req); 190 nilfs_dat_commit_alloc(dat, &req->bpr_req);
189} 191}
190 192
191static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, 193static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap,
192 union nilfs_bmap_ptr_req *req) 194 union nilfs_bmap_ptr_req *req,
195 struct inode *dat)
193{ 196{
194 if (NILFS_BMAP_USE_VBN(bmap)) 197 if (dat)
195 nilfs_bmap_abort_alloc_v(bmap, req); 198 nilfs_dat_abort_alloc(dat, &req->bpr_req);
196 else 199 else
197 bmap->b_last_allocated_ptr--; 200 bmap->b_last_allocated_ptr--;
198} 201}
199 202
200int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
201void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *);
203
204static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, 203static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap,
205 union nilfs_bmap_ptr_req *req) 204 union nilfs_bmap_ptr_req *req,
205 struct inode *dat)
206{ 206{
207 return NILFS_BMAP_USE_VBN(bmap) ? 207 return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0;
208 nilfs_bmap_prepare_end_v(bmap, req) : 0;
209} 208}
210 209
211static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, 210static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap,
212 union nilfs_bmap_ptr_req *req) 211 union nilfs_bmap_ptr_req *req,
212 struct inode *dat)
213{ 213{
214 if (NILFS_BMAP_USE_VBN(bmap)) 214 if (dat)
215 nilfs_bmap_commit_end_v(bmap, req); 215 nilfs_dat_commit_end(dat, &req->bpr_req,
216 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
216} 217}
217 218
218static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, 219static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap,
219 union nilfs_bmap_ptr_req *req) 220 union nilfs_bmap_ptr_req *req,
221 struct inode *dat)
220{ 222{
221 if (NILFS_BMAP_USE_VBN(bmap)) 223 if (dat)
222 nilfs_bmap_abort_end_v(bmap, req); 224 nilfs_dat_abort_end(dat, &req->bpr_req);
223} 225}
224 226
225int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *,
226 sector_t);
227int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
228int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
229
230
231__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, 227__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
232 const struct buffer_head *); 228 const struct buffer_head *);
233 229
234__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); 230__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
235__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); 231__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
236 232
237int nilfs_bmap_prepare_update_v(struct nilfs_bmap *,
238 union nilfs_bmap_ptr_req *,
239 union nilfs_bmap_ptr_req *);
240void nilfs_bmap_commit_update_v(struct nilfs_bmap *,
241 union nilfs_bmap_ptr_req *,
242 union nilfs_bmap_ptr_req *);
243void nilfs_bmap_abort_update_v(struct nilfs_bmap *,
244 union nilfs_bmap_ptr_req *,
245 union nilfs_bmap_ptr_req *);
246
247void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); 233void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
248void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); 234void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
249 235
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 7e0b61be212e..c668bca579c1 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
209 * We cannot call radix_tree_preload for the kernels older 209 * We cannot call radix_tree_preload for the kernels older
210 * than 2.6.23, because it is not exported for modules. 210 * than 2.6.23, because it is not exported for modules.
211 */ 211 */
212retry:
212 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 213 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
213 if (err) 214 if (err)
214 goto failed_unlock; 215 goto failed_unlock;
@@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
219 (unsigned long long)oldkey, 220 (unsigned long long)oldkey,
220 (unsigned long long)newkey); 221 (unsigned long long)newkey);
221 222
222retry:
223 spin_lock_irq(&btnc->tree_lock); 223 spin_lock_irq(&btnc->tree_lock);
224 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); 224 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
225 spin_unlock_irq(&btnc->tree_lock); 225 spin_unlock_irq(&btnc->tree_lock);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index aa412724b64e..e25b507a474f 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void)
71 kmem_cache_destroy(nilfs_btree_path_cache); 71 kmem_cache_destroy(nilfs_btree_path_cache);
72} 72}
73 73
74static inline struct nilfs_btree_path * 74static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void)
75nilfs_btree_alloc_path(const struct nilfs_btree *btree)
76{ 75{
77 return (struct nilfs_btree_path *) 76 return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
79} 77}
80 78
81static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, 79static inline void nilfs_btree_free_path(struct nilfs_btree_path *path)
82 struct nilfs_btree_path *path)
83{ 80{
84 kmem_cache_free(nilfs_btree_path_cache, path); 81 kmem_cache_free(nilfs_btree_path_cache, path);
85} 82}
86 83
87static void nilfs_btree_init_path(const struct nilfs_btree *btree, 84static void nilfs_btree_init_path(struct nilfs_btree_path *path)
88 struct nilfs_btree_path *path)
89{ 85{
90 int level; 86 int level;
91 87
@@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree,
101 } 97 }
102} 98}
103 99
104static void nilfs_btree_clear_path(const struct nilfs_btree *btree, 100static void nilfs_btree_release_path(struct nilfs_btree_path *path)
105 struct nilfs_btree_path *path)
106{ 101{
107 int level; 102 int level;
108 103
109 for (level = NILFS_BTREE_LEVEL_DATA; 104 for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX;
110 level < NILFS_BTREE_LEVEL_MAX; 105 level++)
111 level++) { 106 brelse(path[level].bp_bh);
112 if (path[level].bp_bh != NULL) {
113 brelse(path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124} 107}
125 108
126/* 109/*
@@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree,
148} 131}
149 132
150static inline int 133static inline int
151nilfs_btree_node_get_flags(const struct nilfs_btree *btree, 134nilfs_btree_node_get_flags(const struct nilfs_btree_node *node)
152 const struct nilfs_btree_node *node)
153{ 135{
154 return node->bn_flags; 136 return node->bn_flags;
155} 137}
156 138
157static inline void 139static inline void
158nilfs_btree_node_set_flags(struct nilfs_btree *btree, 140nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags)
159 struct nilfs_btree_node *node,
160 int flags)
161{ 141{
162 node->bn_flags = flags; 142 node->bn_flags = flags;
163} 143}
164 144
165static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, 145static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node)
166 const struct nilfs_btree_node *node)
167{ 146{
168 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; 147 return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT;
169} 148}
170 149
171static inline int 150static inline int
172nilfs_btree_node_get_level(const struct nilfs_btree *btree, 151nilfs_btree_node_get_level(const struct nilfs_btree_node *node)
173 const struct nilfs_btree_node *node)
174{ 152{
175 return node->bn_level; 153 return node->bn_level;
176} 154}
177 155
178static inline void 156static inline void
179nilfs_btree_node_set_level(struct nilfs_btree *btree, 157nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level)
180 struct nilfs_btree_node *node,
181 int level)
182{ 158{
183 node->bn_level = level; 159 node->bn_level = level;
184} 160}
185 161
186static inline int 162static inline int
187nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, 163nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node)
188 const struct nilfs_btree_node *node)
189{ 164{
190 return le16_to_cpu(node->bn_nchildren); 165 return le16_to_cpu(node->bn_nchildren);
191} 166}
192 167
193static inline void 168static inline void
194nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, 169nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren)
195 struct nilfs_btree_node *node,
196 int nchildren)
197{ 170{
198 node->bn_nchildren = cpu_to_le16(nchildren); 171 node->bn_nchildren = cpu_to_le16(nchildren);
199} 172}
200 173
201static inline int 174static inline int nilfs_btree_node_size(const struct nilfs_btree *btree)
202nilfs_btree_node_size(const struct nilfs_btree *btree)
203{ 175{
204 return 1 << btree->bt_bmap.b_inode->i_blkbits; 176 return 1 << btree->bt_bmap.b_inode->i_blkbits;
205} 177}
206 178
207static inline int 179static inline int
208nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, 180nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node,
209 const struct nilfs_btree_node *node) 181 const struct nilfs_btree *btree)
210{ 182{
211 return nilfs_btree_node_root(btree, node) ? 183 return nilfs_btree_node_root(node) ?
212 NILFS_BTREE_ROOT_NCHILDREN_MIN : 184 NILFS_BTREE_ROOT_NCHILDREN_MIN :
213 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); 185 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
214} 186}
215 187
216static inline int 188static inline int
217nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, 189nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node,
218 const struct nilfs_btree_node *node) 190 const struct nilfs_btree *btree)
219{ 191{
220 return nilfs_btree_node_root(btree, node) ? 192 return nilfs_btree_node_root(node) ?
221 NILFS_BTREE_ROOT_NCHILDREN_MAX : 193 NILFS_BTREE_ROOT_NCHILDREN_MAX :
222 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); 194 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
223} 195}
224 196
225static inline __le64 * 197static inline __le64 *
226nilfs_btree_node_dkeys(const struct nilfs_btree *btree, 198nilfs_btree_node_dkeys(const struct nilfs_btree_node *node)
227 const struct nilfs_btree_node *node)
228{ 199{
229 return (__le64 *)((char *)(node + 1) + 200 return (__le64 *)((char *)(node + 1) +
230 (nilfs_btree_node_root(btree, node) ? 201 (nilfs_btree_node_root(node) ?
231 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); 202 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
232} 203}
233 204
234static inline __le64 * 205static inline __le64 *
235nilfs_btree_node_dptrs(const struct nilfs_btree *btree, 206nilfs_btree_node_dptrs(const struct nilfs_btree_node *node,
236 const struct nilfs_btree_node *node) 207 const struct nilfs_btree *btree)
237{ 208{
238 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + 209 return (__le64 *)(nilfs_btree_node_dkeys(node) +
239 nilfs_btree_node_nchildren_max(btree, node)); 210 nilfs_btree_node_nchildren_max(node, btree));
240} 211}
241 212
242static inline __u64 213static inline __u64
243nilfs_btree_node_get_key(const struct nilfs_btree *btree, 214nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index)
244 const struct nilfs_btree_node *node, int index)
245{ 215{
246 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + 216 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index));
247 index));
248} 217}
249 218
250static inline void 219static inline void
251nilfs_btree_node_set_key(struct nilfs_btree *btree, 220nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key)
252 struct nilfs_btree_node *node, int index, __u64 key)
253{ 221{
254 *(nilfs_btree_node_dkeys(btree, node) + index) = 222 *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key);
255 nilfs_bmap_key_to_dkey(key);
256} 223}
257 224
258static inline __u64 225static inline __u64
259nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, 226nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
260 const struct nilfs_btree_node *node, 227 const struct nilfs_btree_node *node, int index)
261 int index)
262{ 228{
263 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + 229 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) +
264 index)); 230 index));
265} 231}
266 232
267static inline void 233static inline void
268nilfs_btree_node_set_ptr(struct nilfs_btree *btree, 234nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
269 struct nilfs_btree_node *node, 235 struct nilfs_btree_node *node, int index, __u64 ptr)
270 int index,
271 __u64 ptr)
272{ 236{
273 *(nilfs_btree_node_dptrs(btree, node) + index) = 237 *(nilfs_btree_node_dptrs(node, btree) + index) =
274 nilfs_bmap_ptr_to_dptr(ptr); 238 nilfs_bmap_ptr_to_dptr(ptr);
275} 239}
276 240
@@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree,
283 __le64 *dptrs; 247 __le64 *dptrs;
284 int i; 248 int i;
285 249
286 nilfs_btree_node_set_flags(btree, node, flags); 250 nilfs_btree_node_set_flags(node, flags);
287 nilfs_btree_node_set_level(btree, node, level); 251 nilfs_btree_node_set_level(node, level);
288 nilfs_btree_node_set_nchildren(btree, node, nchildren); 252 nilfs_btree_node_set_nchildren(node, nchildren);
289 253
290 dkeys = nilfs_btree_node_dkeys(btree, node); 254 dkeys = nilfs_btree_node_dkeys(node);
291 dptrs = nilfs_btree_node_dptrs(btree, node); 255 dptrs = nilfs_btree_node_dptrs(node, btree);
292 for (i = 0; i < nchildren; i++) { 256 for (i = 0; i < nchildren; i++) {
293 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); 257 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
294 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); 258 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
@@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
305 __le64 *ldptrs, *rdptrs; 269 __le64 *ldptrs, *rdptrs;
306 int lnchildren, rnchildren; 270 int lnchildren, rnchildren;
307 271
308 ldkeys = nilfs_btree_node_dkeys(btree, left); 272 ldkeys = nilfs_btree_node_dkeys(left);
309 ldptrs = nilfs_btree_node_dptrs(btree, left); 273 ldptrs = nilfs_btree_node_dptrs(left, btree);
310 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 274 lnchildren = nilfs_btree_node_get_nchildren(left);
311 275
312 rdkeys = nilfs_btree_node_dkeys(btree, right); 276 rdkeys = nilfs_btree_node_dkeys(right);
313 rdptrs = nilfs_btree_node_dptrs(btree, right); 277 rdptrs = nilfs_btree_node_dptrs(right, btree);
314 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 278 rnchildren = nilfs_btree_node_get_nchildren(right);
315 279
316 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); 280 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
317 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); 281 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
@@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
320 284
321 lnchildren += n; 285 lnchildren += n;
322 rnchildren -= n; 286 rnchildren -= n;
323 nilfs_btree_node_set_nchildren(btree, left, lnchildren); 287 nilfs_btree_node_set_nchildren(left, lnchildren);
324 nilfs_btree_node_set_nchildren(btree, right, rnchildren); 288 nilfs_btree_node_set_nchildren(right, rnchildren);
325} 289}
326 290
327/* Assume that the buffer heads corresponding to left and right are locked. */ 291/* Assume that the buffer heads corresponding to left and right are locked. */
@@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
334 __le64 *ldptrs, *rdptrs; 298 __le64 *ldptrs, *rdptrs;
335 int lnchildren, rnchildren; 299 int lnchildren, rnchildren;
336 300
337 ldkeys = nilfs_btree_node_dkeys(btree, left); 301 ldkeys = nilfs_btree_node_dkeys(left);
338 ldptrs = nilfs_btree_node_dptrs(btree, left); 302 ldptrs = nilfs_btree_node_dptrs(left, btree);
339 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 303 lnchildren = nilfs_btree_node_get_nchildren(left);
340 304
341 rdkeys = nilfs_btree_node_dkeys(btree, right); 305 rdkeys = nilfs_btree_node_dkeys(right);
342 rdptrs = nilfs_btree_node_dptrs(btree, right); 306 rdptrs = nilfs_btree_node_dptrs(right, btree);
343 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 307 rnchildren = nilfs_btree_node_get_nchildren(right);
344 308
345 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); 309 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
346 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); 310 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
@@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
349 313
350 lnchildren -= n; 314 lnchildren -= n;
351 rnchildren += n; 315 rnchildren += n;
352 nilfs_btree_node_set_nchildren(btree, left, lnchildren); 316 nilfs_btree_node_set_nchildren(left, lnchildren);
353 nilfs_btree_node_set_nchildren(btree, right, rnchildren); 317 nilfs_btree_node_set_nchildren(right, rnchildren);
354} 318}
355 319
356/* Assume that the buffer head corresponding to node is locked. */ 320/* Assume that the buffer head corresponding to node is locked. */
@@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
362 __le64 *dptrs; 326 __le64 *dptrs;
363 int nchildren; 327 int nchildren;
364 328
365 dkeys = nilfs_btree_node_dkeys(btree, node); 329 dkeys = nilfs_btree_node_dkeys(node);
366 dptrs = nilfs_btree_node_dptrs(btree, node); 330 dptrs = nilfs_btree_node_dptrs(node, btree);
367 nchildren = nilfs_btree_node_get_nchildren(btree, node); 331 nchildren = nilfs_btree_node_get_nchildren(node);
368 if (index < nchildren) { 332 if (index < nchildren) {
369 memmove(dkeys + index + 1, dkeys + index, 333 memmove(dkeys + index + 1, dkeys + index,
370 (nchildren - index) * sizeof(*dkeys)); 334 (nchildren - index) * sizeof(*dkeys));
@@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree,
374 dkeys[index] = nilfs_bmap_key_to_dkey(key); 338 dkeys[index] = nilfs_bmap_key_to_dkey(key);
375 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); 339 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
376 nchildren++; 340 nchildren++;
377 nilfs_btree_node_set_nchildren(btree, node, nchildren); 341 nilfs_btree_node_set_nchildren(node, nchildren);
378} 342}
379 343
380/* Assume that the buffer head corresponding to node is locked. */ 344/* Assume that the buffer head corresponding to node is locked. */
@@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
388 __le64 *dptrs; 352 __le64 *dptrs;
389 int nchildren; 353 int nchildren;
390 354
391 dkeys = nilfs_btree_node_dkeys(btree, node); 355 dkeys = nilfs_btree_node_dkeys(node);
392 dptrs = nilfs_btree_node_dptrs(btree, node); 356 dptrs = nilfs_btree_node_dptrs(node, btree);
393 key = nilfs_bmap_dkey_to_key(dkeys[index]); 357 key = nilfs_bmap_dkey_to_key(dkeys[index]);
394 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); 358 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
395 nchildren = nilfs_btree_node_get_nchildren(btree, node); 359 nchildren = nilfs_btree_node_get_nchildren(node);
396 if (keyp != NULL) 360 if (keyp != NULL)
397 *keyp = key; 361 *keyp = key;
398 if (ptrp != NULL) 362 if (ptrp != NULL)
@@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree,
405 (nchildren - index - 1) * sizeof(*dptrs)); 369 (nchildren - index - 1) * sizeof(*dptrs));
406 } 370 }
407 nchildren--; 371 nchildren--;
408 nilfs_btree_node_set_nchildren(btree, node, nchildren); 372 nilfs_btree_node_set_nchildren(node, nchildren);
409} 373}
410 374
411static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, 375static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
412 const struct nilfs_btree_node *node,
413 __u64 key, int *indexp) 376 __u64 key, int *indexp)
414{ 377{
415 __u64 nkey; 378 __u64 nkey;
@@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
417 380
418 /* binary search */ 381 /* binary search */
419 low = 0; 382 low = 0;
420 high = nilfs_btree_node_get_nchildren(btree, node) - 1; 383 high = nilfs_btree_node_get_nchildren(node) - 1;
421 index = 0; 384 index = 0;
422 s = 0; 385 s = 0;
423 while (low <= high) { 386 while (low <= high) {
424 index = (low + high) / 2; 387 index = (low + high) / 2;
425 nkey = nilfs_btree_node_get_key(btree, node, index); 388 nkey = nilfs_btree_node_get_key(node, index);
426 if (nkey == key) { 389 if (nkey == key) {
427 s = 0; 390 s = 0;
428 goto out; 391 goto out;
@@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
436 } 399 }
437 400
438 /* adjust index */ 401 /* adjust index */
439 if (nilfs_btree_node_get_level(btree, node) > 402 if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) {
440 NILFS_BTREE_LEVEL_NODE_MIN) { 403 if (s > 0 && index > 0)
441 if ((s > 0) && (index > 0))
442 index--; 404 index--;
443 } else if (s < 0) 405 } else if (s < 0)
444 index++; 406 index++;
@@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree)
456} 418}
457 419
458static inline struct nilfs_btree_node * 420static inline struct nilfs_btree_node *
459nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, 421nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level)
460 const struct nilfs_btree_path *path,
461 int level)
462{ 422{
463 return (struct nilfs_btree_node *)path[level].bp_bh->b_data; 423 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
464} 424}
465 425
466static inline struct nilfs_btree_node * 426static inline struct nilfs_btree_node *
467nilfs_btree_get_sib_node(const struct nilfs_btree *btree, 427nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level)
468 const struct nilfs_btree_path *path,
469 int level)
470{ 428{
471 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; 429 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
472} 430}
473 431
474static inline int nilfs_btree_height(const struct nilfs_btree *btree) 432static inline int nilfs_btree_height(const struct nilfs_btree *btree)
475{ 433{
476 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) 434 return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1;
477 + 1;
478} 435}
479 436
480static inline struct nilfs_btree_node * 437static inline struct nilfs_btree_node *
@@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree,
484{ 441{
485 return (level == nilfs_btree_height(btree) - 1) ? 442 return (level == nilfs_btree_height(btree) - 1) ?
486 nilfs_btree_get_root(btree) : 443 nilfs_btree_get_root(btree) :
487 nilfs_btree_get_nonroot_node(btree, path, level); 444 nilfs_btree_get_nonroot_node(path, level);
488} 445}
489 446
490static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, 447static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
@@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
496 int level, index, found, ret; 453 int level, index, found, ret;
497 454
498 node = nilfs_btree_get_root(btree); 455 node = nilfs_btree_get_root(btree);
499 level = nilfs_btree_node_get_level(btree, node); 456 level = nilfs_btree_node_get_level(node);
500 if ((level < minlevel) || 457 if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0)
501 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
502 return -ENOENT; 458 return -ENOENT;
503 459
504 found = nilfs_btree_node_lookup(btree, node, key, &index); 460 found = nilfs_btree_node_lookup(node, key, &index);
505 ptr = nilfs_btree_node_get_ptr(btree, node, index); 461 ptr = nilfs_btree_node_get_ptr(btree, node, index);
506 path[level].bp_bh = NULL; 462 path[level].bp_bh = NULL;
507 path[level].bp_index = index; 463 path[level].bp_index = index;
@@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
510 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 466 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
511 if (ret < 0) 467 if (ret < 0)
512 return ret; 468 return ret;
513 node = nilfs_btree_get_nonroot_node(btree, path, level); 469 node = nilfs_btree_get_nonroot_node(path, level);
514 BUG_ON(level != nilfs_btree_node_get_level(btree, node)); 470 BUG_ON(level != nilfs_btree_node_get_level(node));
515 if (!found) 471 if (!found)
516 found = nilfs_btree_node_lookup(btree, node, key, 472 found = nilfs_btree_node_lookup(node, key, &index);
517 &index);
518 else 473 else
519 index = 0; 474 index = 0;
520 if (index < nilfs_btree_node_nchildren_max(btree, node)) 475 if (index < nilfs_btree_node_nchildren_max(node, btree))
521 ptr = nilfs_btree_node_get_ptr(btree, node, index); 476 ptr = nilfs_btree_node_get_ptr(btree, node, index);
522 else { 477 else {
523 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); 478 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
@@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
544 int index, level, ret; 499 int index, level, ret;
545 500
546 node = nilfs_btree_get_root(btree); 501 node = nilfs_btree_get_root(btree);
547 index = nilfs_btree_node_get_nchildren(btree, node) - 1; 502 index = nilfs_btree_node_get_nchildren(node) - 1;
548 if (index < 0) 503 if (index < 0)
549 return -ENOENT; 504 return -ENOENT;
550 level = nilfs_btree_node_get_level(btree, node); 505 level = nilfs_btree_node_get_level(node);
551 ptr = nilfs_btree_node_get_ptr(btree, node, index); 506 ptr = nilfs_btree_node_get_ptr(btree, node, index);
552 path[level].bp_bh = NULL; 507 path[level].bp_bh = NULL;
553 path[level].bp_index = index; 508 path[level].bp_index = index;
@@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
556 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); 511 ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh);
557 if (ret < 0) 512 if (ret < 0)
558 return ret; 513 return ret;
559 node = nilfs_btree_get_nonroot_node(btree, path, level); 514 node = nilfs_btree_get_nonroot_node(path, level);
560 BUG_ON(level != nilfs_btree_node_get_level(btree, node)); 515 BUG_ON(level != nilfs_btree_node_get_level(node));
561 index = nilfs_btree_node_get_nchildren(btree, node) - 1; 516 index = nilfs_btree_node_get_nchildren(node) - 1;
562 ptr = nilfs_btree_node_get_ptr(btree, node, index); 517 ptr = nilfs_btree_node_get_ptr(btree, node, index);
563 path[level].bp_index = index; 518 path[level].bp_index = index;
564 } 519 }
565 520
566 if (keyp != NULL) 521 if (keyp != NULL)
567 *keyp = nilfs_btree_node_get_key(btree, node, index); 522 *keyp = nilfs_btree_node_get_key(node, index);
568 if (ptrp != NULL) 523 if (ptrp != NULL)
569 *ptrp = ptr; 524 *ptrp = ptr;
570 525
@@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
580 int ret; 535 int ret;
581 536
582 btree = (struct nilfs_btree *)bmap; 537 btree = (struct nilfs_btree *)bmap;
583 path = nilfs_btree_alloc_path(btree); 538 path = nilfs_btree_alloc_path();
584 if (path == NULL) 539 if (path == NULL)
585 return -ENOMEM; 540 return -ENOMEM;
586 nilfs_btree_init_path(btree, path); 541 nilfs_btree_init_path(path);
587 542
588 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 543 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
589 544
590 if (ptrp != NULL) 545 if (ptrp != NULL)
591 *ptrp = ptr; 546 *ptrp = ptr;
592 547
593 nilfs_btree_clear_path(btree, path); 548 nilfs_btree_release_path(path);
594 nilfs_btree_free_path(btree, path); 549 nilfs_btree_free_path(path);
595 550
596 return ret; 551 return ret;
597} 552}
@@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
608 int level = NILFS_BTREE_LEVEL_NODE_MIN; 563 int level = NILFS_BTREE_LEVEL_NODE_MIN;
609 int ret, cnt, index, maxlevel; 564 int ret, cnt, index, maxlevel;
610 565
611 path = nilfs_btree_alloc_path(btree); 566 path = nilfs_btree_alloc_path();
612 if (path == NULL) 567 if (path == NULL)
613 return -ENOMEM; 568 return -ENOMEM;
614 nilfs_btree_init_path(btree, path); 569 nilfs_btree_init_path(path);
615 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); 570 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
616 if (ret < 0) 571 if (ret < 0)
617 goto out; 572 goto out;
@@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
631 node = nilfs_btree_get_node(btree, path, level); 586 node = nilfs_btree_get_node(btree, path, level);
632 index = path[level].bp_index + 1; 587 index = path[level].bp_index + 1;
633 for (;;) { 588 for (;;) {
634 while (index < nilfs_btree_node_get_nchildren(btree, node)) { 589 while (index < nilfs_btree_node_get_nchildren(node)) {
635 if (nilfs_btree_node_get_key(btree, node, index) != 590 if (nilfs_btree_node_get_key(node, index) !=
636 key + cnt) 591 key + cnt)
637 goto end; 592 goto end;
638 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 593 ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
@@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
653 /* look-up right sibling node */ 608 /* look-up right sibling node */
654 node = nilfs_btree_get_node(btree, path, level + 1); 609 node = nilfs_btree_get_node(btree, path, level + 1);
655 index = path[level + 1].bp_index + 1; 610 index = path[level + 1].bp_index + 1;
656 if (index >= nilfs_btree_node_get_nchildren(btree, node) || 611 if (index >= nilfs_btree_node_get_nchildren(node) ||
657 nilfs_btree_node_get_key(btree, node, index) != key + cnt) 612 nilfs_btree_node_get_key(node, index) != key + cnt)
658 break; 613 break;
659 ptr2 = nilfs_btree_node_get_ptr(btree, node, index); 614 ptr2 = nilfs_btree_node_get_ptr(btree, node, index);
660 path[level + 1].bp_index = index; 615 path[level + 1].bp_index = index;
@@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
664 ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); 619 ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh);
665 if (ret < 0) 620 if (ret < 0)
666 goto out; 621 goto out;
667 node = nilfs_btree_get_nonroot_node(btree, path, level); 622 node = nilfs_btree_get_nonroot_node(path, level);
668 index = 0; 623 index = 0;
669 path[level].bp_index = index; 624 path[level].bp_index = index;
670 } 625 }
@@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap,
672 *ptrp = ptr; 627 *ptrp = ptr;
673 ret = cnt; 628 ret = cnt;
674 out: 629 out:
675 nilfs_btree_clear_path(btree, path); 630 nilfs_btree_release_path(path);
676 nilfs_btree_free_path(btree, path); 631 nilfs_btree_free_path(path);
677 return ret; 632 return ret;
678} 633}
679 634
@@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
685 do { 640 do {
686 lock_buffer(path[level].bp_bh); 641 lock_buffer(path[level].bp_bh);
687 nilfs_btree_node_set_key( 642 nilfs_btree_node_set_key(
688 btree, 643 nilfs_btree_get_nonroot_node(path, level),
689 nilfs_btree_get_nonroot_node(
690 btree, path, level),
691 path[level].bp_index, key); 644 path[level].bp_index, key);
692 if (!buffer_dirty(path[level].bp_bh)) 645 if (!buffer_dirty(path[level].bp_bh))
693 nilfs_btnode_mark_dirty(path[level].bp_bh); 646 nilfs_btnode_mark_dirty(path[level].bp_bh);
@@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree,
698 651
699 /* root */ 652 /* root */
700 if (level == nilfs_btree_height(btree) - 1) { 653 if (level == nilfs_btree_height(btree) - 1) {
701 nilfs_btree_node_set_key(btree, 654 nilfs_btree_node_set_key(nilfs_btree_get_root(btree),
702 nilfs_btree_get_root(btree),
703 path[level].bp_index, key); 655 path[level].bp_index, key);
704 } 656 }
705} 657}
@@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
712 664
713 if (level < nilfs_btree_height(btree) - 1) { 665 if (level < nilfs_btree_height(btree) - 1) {
714 lock_buffer(path[level].bp_bh); 666 lock_buffer(path[level].bp_bh);
715 node = nilfs_btree_get_nonroot_node(btree, path, level); 667 node = nilfs_btree_get_nonroot_node(path, level);
716 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 668 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
717 path[level].bp_index); 669 path[level].bp_index);
718 if (!buffer_dirty(path[level].bp_bh)) 670 if (!buffer_dirty(path[level].bp_bh))
@@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree,
721 673
722 if (path[level].bp_index == 0) 674 if (path[level].bp_index == 0)
723 nilfs_btree_promote_key(btree, path, level + 1, 675 nilfs_btree_promote_key(btree, path, level + 1,
724 nilfs_btree_node_get_key( 676 nilfs_btree_node_get_key(node,
725 btree, node, 0)); 677 0));
726 } else { 678 } else {
727 node = nilfs_btree_get_root(btree); 679 node = nilfs_btree_get_root(btree);
728 nilfs_btree_node_insert(btree, node, *keyp, *ptrp, 680 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
@@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
740 lock_buffer(path[level].bp_bh); 692 lock_buffer(path[level].bp_bh);
741 lock_buffer(path[level].bp_sib_bh); 693 lock_buffer(path[level].bp_sib_bh);
742 694
743 node = nilfs_btree_get_nonroot_node(btree, path, level); 695 node = nilfs_btree_get_nonroot_node(path, level);
744 left = nilfs_btree_get_sib_node(btree, path, level); 696 left = nilfs_btree_get_sib_node(path, level);
745 nchildren = nilfs_btree_node_get_nchildren(btree, node); 697 nchildren = nilfs_btree_node_get_nchildren(node);
746 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 698 lnchildren = nilfs_btree_node_get_nchildren(left);
747 move = 0; 699 move = 0;
748 700
749 n = (nchildren + lnchildren + 1) / 2 - lnchildren; 701 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
@@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree,
764 unlock_buffer(path[level].bp_sib_bh); 716 unlock_buffer(path[level].bp_sib_bh);
765 717
766 nilfs_btree_promote_key(btree, path, level + 1, 718 nilfs_btree_promote_key(btree, path, level + 1,
767 nilfs_btree_node_get_key(btree, node, 0)); 719 nilfs_btree_node_get_key(node, 0));
768 720
769 if (move) { 721 if (move) {
770 brelse(path[level].bp_bh); 722 brelse(path[level].bp_bh);
@@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
791 lock_buffer(path[level].bp_bh); 743 lock_buffer(path[level].bp_bh);
792 lock_buffer(path[level].bp_sib_bh); 744 lock_buffer(path[level].bp_sib_bh);
793 745
794 node = nilfs_btree_get_nonroot_node(btree, path, level); 746 node = nilfs_btree_get_nonroot_node(path, level);
795 right = nilfs_btree_get_sib_node(btree, path, level); 747 right = nilfs_btree_get_sib_node(path, level);
796 nchildren = nilfs_btree_node_get_nchildren(btree, node); 748 nchildren = nilfs_btree_node_get_nchildren(node);
797 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 749 rnchildren = nilfs_btree_node_get_nchildren(right);
798 move = 0; 750 move = 0;
799 751
800 n = (nchildren + rnchildren + 1) / 2 - rnchildren; 752 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
@@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree,
816 768
817 path[level + 1].bp_index++; 769 path[level + 1].bp_index++;
818 nilfs_btree_promote_key(btree, path, level + 1, 770 nilfs_btree_promote_key(btree, path, level + 1,
819 nilfs_btree_node_get_key(btree, right, 0)); 771 nilfs_btree_node_get_key(right, 0));
820 path[level + 1].bp_index--; 772 path[level + 1].bp_index--;
821 773
822 if (move) { 774 if (move) {
823 brelse(path[level].bp_bh); 775 brelse(path[level].bp_bh);
824 path[level].bp_bh = path[level].bp_sib_bh; 776 path[level].bp_bh = path[level].bp_sib_bh;
825 path[level].bp_sib_bh = NULL; 777 path[level].bp_sib_bh = NULL;
826 path[level].bp_index -= 778 path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
827 nilfs_btree_node_get_nchildren(btree, node);
828 path[level + 1].bp_index++; 779 path[level + 1].bp_index++;
829 } else { 780 } else {
830 brelse(path[level].bp_sib_bh); 781 brelse(path[level].bp_sib_bh);
@@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
846 lock_buffer(path[level].bp_bh); 797 lock_buffer(path[level].bp_bh);
847 lock_buffer(path[level].bp_sib_bh); 798 lock_buffer(path[level].bp_sib_bh);
848 799
849 node = nilfs_btree_get_nonroot_node(btree, path, level); 800 node = nilfs_btree_get_nonroot_node(path, level);
850 right = nilfs_btree_get_sib_node(btree, path, level); 801 right = nilfs_btree_get_sib_node(path, level);
851 nchildren = nilfs_btree_node_get_nchildren(btree, node); 802 nchildren = nilfs_btree_node_get_nchildren(node);
852 move = 0; 803 move = 0;
853 804
854 n = (nchildren + 1) / 2; 805 n = (nchildren + 1) / 2;
@@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
867 unlock_buffer(path[level].bp_bh); 818 unlock_buffer(path[level].bp_bh);
868 unlock_buffer(path[level].bp_sib_bh); 819 unlock_buffer(path[level].bp_sib_bh);
869 820
870 newkey = nilfs_btree_node_get_key(btree, right, 0); 821 newkey = nilfs_btree_node_get_key(right, 0);
871 newptr = path[level].bp_newreq.bpr_ptr; 822 newptr = path[level].bp_newreq.bpr_ptr;
872 823
873 if (move) { 824 if (move) {
874 path[level].bp_index -= 825 path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
875 nilfs_btree_node_get_nchildren(btree, node);
876 nilfs_btree_node_insert(btree, right, *keyp, *ptrp, 826 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
877 path[level].bp_index); 827 path[level].bp_index);
878 828
879 *keyp = nilfs_btree_node_get_key(btree, right, 0); 829 *keyp = nilfs_btree_node_get_key(right, 0);
880 *ptrp = path[level].bp_newreq.bpr_ptr; 830 *ptrp = path[level].bp_newreq.bpr_ptr;
881 831
882 brelse(path[level].bp_bh); 832 brelse(path[level].bp_bh);
@@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree,
885 } else { 835 } else {
886 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 836 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
887 837
888 *keyp = nilfs_btree_node_get_key(btree, right, 0); 838 *keyp = nilfs_btree_node_get_key(right, 0);
889 *ptrp = path[level].bp_newreq.bpr_ptr; 839 *ptrp = path[level].bp_newreq.bpr_ptr;
890 840
891 brelse(path[level].bp_sib_bh); 841 brelse(path[level].bp_sib_bh);
@@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
905 lock_buffer(path[level].bp_sib_bh); 855 lock_buffer(path[level].bp_sib_bh);
906 856
907 root = nilfs_btree_get_root(btree); 857 root = nilfs_btree_get_root(btree);
908 child = nilfs_btree_get_sib_node(btree, path, level); 858 child = nilfs_btree_get_sib_node(path, level);
909 859
910 n = nilfs_btree_node_get_nchildren(btree, root); 860 n = nilfs_btree_node_get_nchildren(root);
911 861
912 nilfs_btree_node_move_right(btree, root, child, n); 862 nilfs_btree_node_move_right(btree, root, child, n);
913 nilfs_btree_node_set_level(btree, root, level + 1); 863 nilfs_btree_node_set_level(root, level + 1);
914 864
915 if (!buffer_dirty(path[level].bp_sib_bh)) 865 if (!buffer_dirty(path[level].bp_sib_bh))
916 nilfs_btnode_mark_dirty(path[level].bp_sib_bh); 866 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
@@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree,
922 872
923 nilfs_btree_do_insert(btree, path, level, keyp, ptrp); 873 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
924 874
925 *keyp = nilfs_btree_node_get_key(btree, child, 0); 875 *keyp = nilfs_btree_node_get_key(child, 0);
926 *ptrp = path[level].bp_newreq.bpr_ptr; 876 *ptrp = path[level].bp_newreq.bpr_ptr;
927} 877}
928 878
@@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
990 struct nilfs_btree_node *node, *parent, *sib; 940 struct nilfs_btree_node *node, *parent, *sib;
991 __u64 sibptr; 941 __u64 sibptr;
992 int pindex, level, ret; 942 int pindex, level, ret;
943 struct inode *dat = NULL;
993 944
994 stats->bs_nblocks = 0; 945 stats->bs_nblocks = 0;
995 level = NILFS_BTREE_LEVEL_DATA; 946 level = NILFS_BTREE_LEVEL_DATA;
996 947
997 /* allocate a new ptr for data block */ 948 /* allocate a new ptr for data block */
998 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) 949 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
999 path[level].bp_newreq.bpr_ptr = 950 path[level].bp_newreq.bpr_ptr =
1000 nilfs_btree_find_target_v(btree, path, key); 951 nilfs_btree_find_target_v(btree, path, key);
952 dat = nilfs_bmap_get_dat(&btree->bt_bmap);
953 }
1001 954
1002 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 955 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
1003 &path[level].bp_newreq); 956 &path[level].bp_newreq, dat);
1004 if (ret < 0) 957 if (ret < 0)
1005 goto err_out_data; 958 goto err_out_data;
1006 959
1007 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 960 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1008 level < nilfs_btree_height(btree) - 1; 961 level < nilfs_btree_height(btree) - 1;
1009 level++) { 962 level++) {
1010 node = nilfs_btree_get_nonroot_node(btree, path, level); 963 node = nilfs_btree_get_nonroot_node(path, level);
1011 if (nilfs_btree_node_get_nchildren(btree, node) < 964 if (nilfs_btree_node_get_nchildren(node) <
1012 nilfs_btree_node_nchildren_max(btree, node)) { 965 nilfs_btree_node_nchildren_max(node, btree)) {
1013 path[level].bp_op = nilfs_btree_do_insert; 966 path[level].bp_op = nilfs_btree_do_insert;
1014 stats->bs_nblocks++; 967 stats->bs_nblocks++;
1015 goto out; 968 goto out;
@@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1026 if (ret < 0) 979 if (ret < 0)
1027 goto err_out_child_node; 980 goto err_out_child_node;
1028 sib = (struct nilfs_btree_node *)bh->b_data; 981 sib = (struct nilfs_btree_node *)bh->b_data;
1029 if (nilfs_btree_node_get_nchildren(btree, sib) < 982 if (nilfs_btree_node_get_nchildren(sib) <
1030 nilfs_btree_node_nchildren_max(btree, sib)) { 983 nilfs_btree_node_nchildren_max(sib, btree)) {
1031 path[level].bp_sib_bh = bh; 984 path[level].bp_sib_bh = bh;
1032 path[level].bp_op = nilfs_btree_carry_left; 985 path[level].bp_op = nilfs_btree_carry_left;
1033 stats->bs_nblocks++; 986 stats->bs_nblocks++;
@@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1038 991
1039 /* right sibling */ 992 /* right sibling */
1040 if (pindex < 993 if (pindex <
1041 nilfs_btree_node_get_nchildren(btree, parent) - 1) { 994 nilfs_btree_node_get_nchildren(parent) - 1) {
1042 sibptr = nilfs_btree_node_get_ptr(btree, parent, 995 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1043 pindex + 1); 996 pindex + 1);
1044 ret = nilfs_btree_get_block(btree, sibptr, &bh); 997 ret = nilfs_btree_get_block(btree, sibptr, &bh);
1045 if (ret < 0) 998 if (ret < 0)
1046 goto err_out_child_node; 999 goto err_out_child_node;
1047 sib = (struct nilfs_btree_node *)bh->b_data; 1000 sib = (struct nilfs_btree_node *)bh->b_data;
1048 if (nilfs_btree_node_get_nchildren(btree, sib) < 1001 if (nilfs_btree_node_get_nchildren(sib) <
1049 nilfs_btree_node_nchildren_max(btree, sib)) { 1002 nilfs_btree_node_nchildren_max(sib, btree)) {
1050 path[level].bp_sib_bh = bh; 1003 path[level].bp_sib_bh = bh;
1051 path[level].bp_op = nilfs_btree_carry_right; 1004 path[level].bp_op = nilfs_btree_carry_right;
1052 stats->bs_nblocks++; 1005 stats->bs_nblocks++;
@@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1059 path[level].bp_newreq.bpr_ptr = 1012 path[level].bp_newreq.bpr_ptr =
1060 path[level - 1].bp_newreq.bpr_ptr + 1; 1013 path[level - 1].bp_newreq.bpr_ptr + 1;
1061 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1014 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
1062 &path[level].bp_newreq); 1015 &path[level].bp_newreq, dat);
1063 if (ret < 0) 1016 if (ret < 0)
1064 goto err_out_child_node; 1017 goto err_out_child_node;
1065 ret = nilfs_btree_get_new_block(btree, 1018 ret = nilfs_btree_get_new_block(btree,
@@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1081 1034
1082 /* root */ 1035 /* root */
1083 node = nilfs_btree_get_root(btree); 1036 node = nilfs_btree_get_root(btree);
1084 if (nilfs_btree_node_get_nchildren(btree, node) < 1037 if (nilfs_btree_node_get_nchildren(node) <
1085 nilfs_btree_node_nchildren_max(btree, node)) { 1038 nilfs_btree_node_nchildren_max(node, btree)) {
1086 path[level].bp_op = nilfs_btree_do_insert; 1039 path[level].bp_op = nilfs_btree_do_insert;
1087 stats->bs_nblocks++; 1040 stats->bs_nblocks++;
1088 goto out; 1041 goto out;
@@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1091 /* grow */ 1044 /* grow */
1092 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; 1045 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
1093 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, 1046 ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
1094 &path[level].bp_newreq); 1047 &path[level].bp_newreq, dat);
1095 if (ret < 0) 1048 if (ret < 0)
1096 goto err_out_child_node; 1049 goto err_out_child_node;
1097 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, 1050 ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr,
@@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
1119 1072
1120 /* error */ 1073 /* error */
1121 err_out_curr_node: 1074 err_out_curr_node:
1122 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); 1075 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
1076 dat);
1123 err_out_child_node: 1077 err_out_child_node:
1124 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { 1078 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1125 nilfs_btnode_delete(path[level].bp_sib_bh); 1079 nilfs_btnode_delete(path[level].bp_sib_bh);
1126 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, 1080 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
1127 &path[level].bp_newreq); 1081 &path[level].bp_newreq, dat);
1128 1082
1129 } 1083 }
1130 1084
1131 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); 1085 nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq,
1086 dat);
1132 err_out_data: 1087 err_out_data:
1133 *levelp = level; 1088 *levelp = level;
1134 stats->bs_nblocks = 0; 1089 stats->bs_nblocks = 0;
@@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1139 struct nilfs_btree_path *path, 1094 struct nilfs_btree_path *path,
1140 int maxlevel, __u64 key, __u64 ptr) 1095 int maxlevel, __u64 key, __u64 ptr)
1141{ 1096{
1097 struct inode *dat = NULL;
1142 int level; 1098 int level;
1143 1099
1144 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1100 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1145 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; 1101 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1146 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) 1102 if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) {
1147 nilfs_btree_set_target_v(btree, key, ptr); 1103 nilfs_btree_set_target_v(btree, key, ptr);
1104 dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1105 }
1148 1106
1149 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1107 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1150 nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, 1108 nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
1151 &path[level - 1].bp_newreq); 1109 &path[level - 1].bp_newreq, dat);
1152 path[level].bp_op(btree, path, level, &key, &ptr); 1110 path[level].bp_op(btree, path, level, &key, &ptr);
1153 } 1111 }
1154 1112
@@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1164 int level, ret; 1122 int level, ret;
1165 1123
1166 btree = (struct nilfs_btree *)bmap; 1124 btree = (struct nilfs_btree *)bmap;
1167 path = nilfs_btree_alloc_path(btree); 1125 path = nilfs_btree_alloc_path();
1168 if (path == NULL) 1126 if (path == NULL)
1169 return -ENOMEM; 1127 return -ENOMEM;
1170 nilfs_btree_init_path(btree, path); 1128 nilfs_btree_init_path(path);
1171 1129
1172 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1130 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1173 NILFS_BTREE_LEVEL_NODE_MIN); 1131 NILFS_BTREE_LEVEL_NODE_MIN);
@@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1184 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 1142 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1185 1143
1186 out: 1144 out:
1187 nilfs_btree_clear_path(btree, path); 1145 nilfs_btree_release_path(path);
1188 nilfs_btree_free_path(btree, path); 1146 nilfs_btree_free_path(path);
1189 return ret; 1147 return ret;
1190} 1148}
1191 1149
@@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1197 1155
1198 if (level < nilfs_btree_height(btree) - 1) { 1156 if (level < nilfs_btree_height(btree) - 1) {
1199 lock_buffer(path[level].bp_bh); 1157 lock_buffer(path[level].bp_bh);
1200 node = nilfs_btree_get_nonroot_node(btree, path, level); 1158 node = nilfs_btree_get_nonroot_node(path, level);
1201 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1159 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1202 path[level].bp_index); 1160 path[level].bp_index);
1203 if (!buffer_dirty(path[level].bp_bh)) 1161 if (!buffer_dirty(path[level].bp_bh))
@@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1205 unlock_buffer(path[level].bp_bh); 1163 unlock_buffer(path[level].bp_bh);
1206 if (path[level].bp_index == 0) 1164 if (path[level].bp_index == 0)
1207 nilfs_btree_promote_key(btree, path, level + 1, 1165 nilfs_btree_promote_key(btree, path, level + 1,
1208 nilfs_btree_node_get_key(btree, node, 0)); 1166 nilfs_btree_node_get_key(node, 0));
1209 } else { 1167 } else {
1210 node = nilfs_btree_get_root(btree); 1168 node = nilfs_btree_get_root(btree);
1211 nilfs_btree_node_delete(btree, node, keyp, ptrp, 1169 nilfs_btree_node_delete(btree, node, keyp, ptrp,
@@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1225 lock_buffer(path[level].bp_bh); 1183 lock_buffer(path[level].bp_bh);
1226 lock_buffer(path[level].bp_sib_bh); 1184 lock_buffer(path[level].bp_sib_bh);
1227 1185
1228 node = nilfs_btree_get_nonroot_node(btree, path, level); 1186 node = nilfs_btree_get_nonroot_node(path, level);
1229 left = nilfs_btree_get_sib_node(btree, path, level); 1187 left = nilfs_btree_get_sib_node(path, level);
1230 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1188 nchildren = nilfs_btree_node_get_nchildren(node);
1231 lnchildren = nilfs_btree_node_get_nchildren(btree, left); 1189 lnchildren = nilfs_btree_node_get_nchildren(left);
1232 1190
1233 n = (nchildren + lnchildren) / 2 - nchildren; 1191 n = (nchildren + lnchildren) / 2 - nchildren;
1234 1192
@@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1243 unlock_buffer(path[level].bp_sib_bh); 1201 unlock_buffer(path[level].bp_sib_bh);
1244 1202
1245 nilfs_btree_promote_key(btree, path, level + 1, 1203 nilfs_btree_promote_key(btree, path, level + 1,
1246 nilfs_btree_node_get_key(btree, node, 0)); 1204 nilfs_btree_node_get_key(node, 0));
1247 1205
1248 brelse(path[level].bp_sib_bh); 1206 brelse(path[level].bp_sib_bh);
1249 path[level].bp_sib_bh = NULL; 1207 path[level].bp_sib_bh = NULL;
@@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1262 lock_buffer(path[level].bp_bh); 1220 lock_buffer(path[level].bp_bh);
1263 lock_buffer(path[level].bp_sib_bh); 1221 lock_buffer(path[level].bp_sib_bh);
1264 1222
1265 node = nilfs_btree_get_nonroot_node(btree, path, level); 1223 node = nilfs_btree_get_nonroot_node(path, level);
1266 right = nilfs_btree_get_sib_node(btree, path, level); 1224 right = nilfs_btree_get_sib_node(path, level);
1267 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1225 nchildren = nilfs_btree_node_get_nchildren(node);
1268 rnchildren = nilfs_btree_node_get_nchildren(btree, right); 1226 rnchildren = nilfs_btree_node_get_nchildren(right);
1269 1227
1270 n = (nchildren + rnchildren) / 2 - nchildren; 1228 n = (nchildren + rnchildren) / 2 - nchildren;
1271 1229
@@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1281 1239
1282 path[level + 1].bp_index++; 1240 path[level + 1].bp_index++;
1283 nilfs_btree_promote_key(btree, path, level + 1, 1241 nilfs_btree_promote_key(btree, path, level + 1,
1284 nilfs_btree_node_get_key(btree, right, 0)); 1242 nilfs_btree_node_get_key(right, 0));
1285 path[level + 1].bp_index--; 1243 path[level + 1].bp_index--;
1286 1244
1287 brelse(path[level].bp_sib_bh); 1245 brelse(path[level].bp_sib_bh);
@@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1300 lock_buffer(path[level].bp_bh); 1258 lock_buffer(path[level].bp_bh);
1301 lock_buffer(path[level].bp_sib_bh); 1259 lock_buffer(path[level].bp_sib_bh);
1302 1260
1303 node = nilfs_btree_get_nonroot_node(btree, path, level); 1261 node = nilfs_btree_get_nonroot_node(path, level);
1304 left = nilfs_btree_get_sib_node(btree, path, level); 1262 left = nilfs_btree_get_sib_node(path, level);
1305 1263
1306 n = nilfs_btree_node_get_nchildren(btree, node); 1264 n = nilfs_btree_node_get_nchildren(node);
1307 1265
1308 nilfs_btree_node_move_left(btree, left, node, n); 1266 nilfs_btree_node_move_left(btree, left, node, n);
1309 1267
@@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1316 nilfs_btnode_delete(path[level].bp_bh); 1274 nilfs_btnode_delete(path[level].bp_bh);
1317 path[level].bp_bh = path[level].bp_sib_bh; 1275 path[level].bp_bh = path[level].bp_sib_bh;
1318 path[level].bp_sib_bh = NULL; 1276 path[level].bp_sib_bh = NULL;
1319 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); 1277 path[level].bp_index += nilfs_btree_node_get_nchildren(left);
1320} 1278}
1321 1279
1322static void nilfs_btree_concat_right(struct nilfs_btree *btree, 1280static void nilfs_btree_concat_right(struct nilfs_btree *btree,
@@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1331 lock_buffer(path[level].bp_bh); 1289 lock_buffer(path[level].bp_bh);
1332 lock_buffer(path[level].bp_sib_bh); 1290 lock_buffer(path[level].bp_sib_bh);
1333 1291
1334 node = nilfs_btree_get_nonroot_node(btree, path, level); 1292 node = nilfs_btree_get_nonroot_node(path, level);
1335 right = nilfs_btree_get_sib_node(btree, path, level); 1293 right = nilfs_btree_get_sib_node(path, level);
1336 1294
1337 n = nilfs_btree_node_get_nchildren(btree, right); 1295 n = nilfs_btree_node_get_nchildren(right);
1338 1296
1339 nilfs_btree_node_move_left(btree, node, right, n); 1297 nilfs_btree_node_move_left(btree, node, right, n);
1340 1298
@@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1360 1318
1361 lock_buffer(path[level].bp_bh); 1319 lock_buffer(path[level].bp_bh);
1362 root = nilfs_btree_get_root(btree); 1320 root = nilfs_btree_get_root(btree);
1363 child = nilfs_btree_get_nonroot_node(btree, path, level); 1321 child = nilfs_btree_get_nonroot_node(path, level);
1364 1322
1365 nilfs_btree_node_delete(btree, root, NULL, NULL, 0); 1323 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1366 nilfs_btree_node_set_level(btree, root, level); 1324 nilfs_btree_node_set_level(root, level);
1367 n = nilfs_btree_node_get_nchildren(btree, child); 1325 n = nilfs_btree_node_get_nchildren(child);
1368 nilfs_btree_node_move_left(btree, root, child, n); 1326 nilfs_btree_node_move_left(btree, root, child, n);
1369 unlock_buffer(path[level].bp_bh); 1327 unlock_buffer(path[level].bp_bh);
1370 1328
@@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree,
1376static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, 1334static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1377 struct nilfs_btree_path *path, 1335 struct nilfs_btree_path *path,
1378 int *levelp, 1336 int *levelp,
1379 struct nilfs_bmap_stats *stats) 1337 struct nilfs_bmap_stats *stats,
1338 struct inode *dat)
1380{ 1339{
1381 struct buffer_head *bh; 1340 struct buffer_head *bh;
1382 struct nilfs_btree_node *node, *parent, *sib; 1341 struct nilfs_btree_node *node, *parent, *sib;
@@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1388 for (level = NILFS_BTREE_LEVEL_NODE_MIN; 1347 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1389 level < nilfs_btree_height(btree) - 1; 1348 level < nilfs_btree_height(btree) - 1;
1390 level++) { 1349 level++) {
1391 node = nilfs_btree_get_nonroot_node(btree, path, level); 1350 node = nilfs_btree_get_nonroot_node(path, level);
1392 path[level].bp_oldreq.bpr_ptr = 1351 path[level].bp_oldreq.bpr_ptr =
1393 nilfs_btree_node_get_ptr(btree, node, 1352 nilfs_btree_node_get_ptr(btree, node,
1394 path[level].bp_index); 1353 path[level].bp_index);
1395 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1354 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
1396 &path[level].bp_oldreq); 1355 &path[level].bp_oldreq, dat);
1397 if (ret < 0) 1356 if (ret < 0)
1398 goto err_out_child_node; 1357 goto err_out_child_node;
1399 1358
1400 if (nilfs_btree_node_get_nchildren(btree, node) > 1359 if (nilfs_btree_node_get_nchildren(node) >
1401 nilfs_btree_node_nchildren_min(btree, node)) { 1360 nilfs_btree_node_nchildren_min(node, btree)) {
1402 path[level].bp_op = nilfs_btree_do_delete; 1361 path[level].bp_op = nilfs_btree_do_delete;
1403 stats->bs_nblocks++; 1362 stats->bs_nblocks++;
1404 goto out; 1363 goto out;
@@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1415 if (ret < 0) 1374 if (ret < 0)
1416 goto err_out_curr_node; 1375 goto err_out_curr_node;
1417 sib = (struct nilfs_btree_node *)bh->b_data; 1376 sib = (struct nilfs_btree_node *)bh->b_data;
1418 if (nilfs_btree_node_get_nchildren(btree, sib) > 1377 if (nilfs_btree_node_get_nchildren(sib) >
1419 nilfs_btree_node_nchildren_min(btree, sib)) { 1378 nilfs_btree_node_nchildren_min(sib, btree)) {
1420 path[level].bp_sib_bh = bh; 1379 path[level].bp_sib_bh = bh;
1421 path[level].bp_op = nilfs_btree_borrow_left; 1380 path[level].bp_op = nilfs_btree_borrow_left;
1422 stats->bs_nblocks++; 1381 stats->bs_nblocks++;
@@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1428 /* continue; */ 1387 /* continue; */
1429 } 1388 }
1430 } else if (pindex < 1389 } else if (pindex <
1431 nilfs_btree_node_get_nchildren(btree, parent) - 1) { 1390 nilfs_btree_node_get_nchildren(parent) - 1) {
1432 /* right sibling */ 1391 /* right sibling */
1433 sibptr = nilfs_btree_node_get_ptr(btree, parent, 1392 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1434 pindex + 1); 1393 pindex + 1);
@@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1436 if (ret < 0) 1395 if (ret < 0)
1437 goto err_out_curr_node; 1396 goto err_out_curr_node;
1438 sib = (struct nilfs_btree_node *)bh->b_data; 1397 sib = (struct nilfs_btree_node *)bh->b_data;
1439 if (nilfs_btree_node_get_nchildren(btree, sib) > 1398 if (nilfs_btree_node_get_nchildren(sib) >
1440 nilfs_btree_node_nchildren_min(btree, sib)) { 1399 nilfs_btree_node_nchildren_min(sib, btree)) {
1441 path[level].bp_sib_bh = bh; 1400 path[level].bp_sib_bh = bh;
1442 path[level].bp_op = nilfs_btree_borrow_right; 1401 path[level].bp_op = nilfs_btree_borrow_right;
1443 stats->bs_nblocks++; 1402 stats->bs_nblocks++;
@@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1452 /* no siblings */ 1411 /* no siblings */
1453 /* the only child of the root node */ 1412 /* the only child of the root node */
1454 WARN_ON(level != nilfs_btree_height(btree) - 2); 1413 WARN_ON(level != nilfs_btree_height(btree) - 2);
1455 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= 1414 if (nilfs_btree_node_get_nchildren(node) - 1 <=
1456 NILFS_BTREE_ROOT_NCHILDREN_MAX) { 1415 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1457 path[level].bp_op = nilfs_btree_shrink; 1416 path[level].bp_op = nilfs_btree_shrink;
1458 stats->bs_nblocks += 2; 1417 stats->bs_nblocks += 2;
@@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1471 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); 1430 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1472 1431
1473 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, 1432 ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap,
1474 &path[level].bp_oldreq); 1433 &path[level].bp_oldreq, dat);
1475 if (ret < 0) 1434 if (ret < 0)
1476 goto err_out_child_node; 1435 goto err_out_child_node;
1477 1436
@@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1486 1445
1487 /* error */ 1446 /* error */
1488 err_out_curr_node: 1447 err_out_curr_node:
1489 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); 1448 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat);
1490 err_out_child_node: 1449 err_out_child_node:
1491 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { 1450 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1492 brelse(path[level].bp_sib_bh); 1451 brelse(path[level].bp_sib_bh);
1493 nilfs_bmap_abort_end_ptr(&btree->bt_bmap, 1452 nilfs_bmap_abort_end_ptr(&btree->bt_bmap,
1494 &path[level].bp_oldreq); 1453 &path[level].bp_oldreq, dat);
1495 } 1454 }
1496 *levelp = level; 1455 *levelp = level;
1497 stats->bs_nblocks = 0; 1456 stats->bs_nblocks = 0;
@@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1500 1459
1501static void nilfs_btree_commit_delete(struct nilfs_btree *btree, 1460static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1502 struct nilfs_btree_path *path, 1461 struct nilfs_btree_path *path,
1503 int maxlevel) 1462 int maxlevel, struct inode *dat)
1504{ 1463{
1505 int level; 1464 int level;
1506 1465
1507 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { 1466 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1508 nilfs_bmap_commit_end_ptr(&btree->bt_bmap, 1467 nilfs_bmap_commit_end_ptr(&btree->bt_bmap,
1509 &path[level].bp_oldreq); 1468 &path[level].bp_oldreq, dat);
1510 path[level].bp_op(btree, path, level, NULL, NULL); 1469 path[level].bp_op(btree, path, level, NULL, NULL);
1511 } 1470 }
1512 1471
@@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1520 struct nilfs_btree *btree; 1479 struct nilfs_btree *btree;
1521 struct nilfs_btree_path *path; 1480 struct nilfs_btree_path *path;
1522 struct nilfs_bmap_stats stats; 1481 struct nilfs_bmap_stats stats;
1482 struct inode *dat;
1523 int level, ret; 1483 int level, ret;
1524 1484
1525 btree = (struct nilfs_btree *)bmap; 1485 btree = (struct nilfs_btree *)bmap;
1526 path = nilfs_btree_alloc_path(btree); 1486 path = nilfs_btree_alloc_path();
1527 if (path == NULL) 1487 if (path == NULL)
1528 return -ENOMEM; 1488 return -ENOMEM;
1529 nilfs_btree_init_path(btree, path); 1489 nilfs_btree_init_path(path);
1530 ret = nilfs_btree_do_lookup(btree, path, key, NULL, 1490 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1531 NILFS_BTREE_LEVEL_NODE_MIN); 1491 NILFS_BTREE_LEVEL_NODE_MIN);
1532 if (ret < 0) 1492 if (ret < 0)
1533 goto out; 1493 goto out;
1534 1494
1535 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); 1495
1496 dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ?
1497 nilfs_bmap_get_dat(&btree->bt_bmap) : NULL;
1498
1499 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
1536 if (ret < 0) 1500 if (ret < 0)
1537 goto out; 1501 goto out;
1538 nilfs_btree_commit_delete(btree, path, level); 1502 nilfs_btree_commit_delete(btree, path, level, dat);
1539 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); 1503 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1540 1504
1541out: 1505out:
1542 nilfs_btree_clear_path(btree, path); 1506 nilfs_btree_release_path(path);
1543 nilfs_btree_free_path(btree, path); 1507 nilfs_btree_free_path(path);
1544 return ret; 1508 return ret;
1545} 1509}
1546 1510
@@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1551 int ret; 1515 int ret;
1552 1516
1553 btree = (struct nilfs_btree *)bmap; 1517 btree = (struct nilfs_btree *)bmap;
1554 path = nilfs_btree_alloc_path(btree); 1518 path = nilfs_btree_alloc_path();
1555 if (path == NULL) 1519 if (path == NULL)
1556 return -ENOMEM; 1520 return -ENOMEM;
1557 nilfs_btree_init_path(btree, path); 1521 nilfs_btree_init_path(path);
1558 1522
1559 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); 1523 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1560 1524
1561 nilfs_btree_clear_path(btree, path); 1525 nilfs_btree_release_path(path);
1562 nilfs_btree_free_path(btree, path); 1526 nilfs_btree_free_path(path);
1563 1527
1564 return ret; 1528 return ret;
1565} 1529}
@@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1581 node = root; 1545 node = root;
1582 break; 1546 break;
1583 case 3: 1547 case 3:
1584 nchildren = nilfs_btree_node_get_nchildren(btree, root); 1548 nchildren = nilfs_btree_node_get_nchildren(root);
1585 if (nchildren > 1) 1549 if (nchildren > 1)
1586 return 0; 1550 return 0;
1587 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1551 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
@@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1594 return 0; 1558 return 0;
1595 } 1559 }
1596 1560
1597 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1561 nchildren = nilfs_btree_node_get_nchildren(node);
1598 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); 1562 maxkey = nilfs_btree_node_get_key(node, nchildren - 1);
1599 nextmaxkey = (nchildren > 1) ? 1563 nextmaxkey = (nchildren > 1) ?
1600 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; 1564 nilfs_btree_node_get_key(node, nchildren - 2) : 0;
1601 if (bh != NULL) 1565 if (bh != NULL)
1602 brelse(bh); 1566 brelse(bh);
1603 1567
@@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1623 node = root; 1587 node = root;
1624 break; 1588 break;
1625 case 3: 1589 case 3:
1626 nchildren = nilfs_btree_node_get_nchildren(btree, root); 1590 nchildren = nilfs_btree_node_get_nchildren(root);
1627 WARN_ON(nchildren > 1); 1591 WARN_ON(nchildren > 1);
1628 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); 1592 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1629 ret = nilfs_btree_get_block(btree, ptr, &bh); 1593 ret = nilfs_btree_get_block(btree, ptr, &bh);
@@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1636 return -EINVAL; 1600 return -EINVAL;
1637 } 1601 }
1638 1602
1639 nchildren = nilfs_btree_node_get_nchildren(btree, node); 1603 nchildren = nilfs_btree_node_get_nchildren(node);
1640 if (nchildren < nitems) 1604 if (nchildren < nitems)
1641 nitems = nchildren; 1605 nitems = nchildren;
1642 dkeys = nilfs_btree_node_dkeys(btree, node); 1606 dkeys = nilfs_btree_node_dkeys(node);
1643 dptrs = nilfs_btree_node_dptrs(btree, node); 1607 dptrs = nilfs_btree_node_dptrs(node, btree);
1644 for (i = 0; i < nitems; i++) { 1608 for (i = 0; i < nitems; i++) {
1645 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); 1609 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1646 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); 1610 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
@@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1660 struct nilfs_bmap_stats *stats) 1624 struct nilfs_bmap_stats *stats)
1661{ 1625{
1662 struct buffer_head *bh; 1626 struct buffer_head *bh;
1663 struct nilfs_btree *btree; 1627 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1628 struct inode *dat = NULL;
1664 int ret; 1629 int ret;
1665 1630
1666 btree = (struct nilfs_btree *)bmap;
1667 stats->bs_nblocks = 0; 1631 stats->bs_nblocks = 0;
1668 1632
1669 /* for data */ 1633 /* for data */
1670 /* cannot find near ptr */ 1634 /* cannot find near ptr */
1671 if (NILFS_BMAP_USE_VBN(bmap)) 1635 if (NILFS_BMAP_USE_VBN(bmap)) {
1672 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); 1636 dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key);
1637 dat = nilfs_bmap_get_dat(bmap);
1638 }
1673 1639
1674 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); 1640 ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat);
1675 if (ret < 0) 1641 if (ret < 0)
1676 return ret; 1642 return ret;
1677 1643
@@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1679 stats->bs_nblocks++; 1645 stats->bs_nblocks++;
1680 if (nreq != NULL) { 1646 if (nreq != NULL) {
1681 nreq->bpr_ptr = dreq->bpr_ptr + 1; 1647 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1682 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); 1648 ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat);
1683 if (ret < 0) 1649 if (ret < 0)
1684 goto err_out_dreq; 1650 goto err_out_dreq;
1685 1651
@@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1696 1662
1697 /* error */ 1663 /* error */
1698 err_out_nreq: 1664 err_out_nreq:
1699 nilfs_bmap_abort_alloc_ptr(bmap, nreq); 1665 nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat);
1700 err_out_dreq: 1666 err_out_dreq:
1701 nilfs_bmap_abort_alloc_ptr(bmap, dreq); 1667 nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat);
1702 stats->bs_nblocks = 0; 1668 stats->bs_nblocks = 0;
1703 return ret; 1669 return ret;
1704 1670
@@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1713 union nilfs_bmap_ptr_req *nreq, 1679 union nilfs_bmap_ptr_req *nreq,
1714 struct buffer_head *bh) 1680 struct buffer_head *bh)
1715{ 1681{
1716 struct nilfs_btree *btree; 1682 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1717 struct nilfs_btree_node *node; 1683 struct nilfs_btree_node *node;
1684 struct inode *dat;
1718 __u64 tmpptr; 1685 __u64 tmpptr;
1719 1686
1720 /* free resources */ 1687 /* free resources */
@@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1725 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); 1692 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1726 1693
1727 /* convert and insert */ 1694 /* convert and insert */
1728 btree = (struct nilfs_btree *)bmap; 1695 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
1729 nilfs_btree_init(bmap); 1696 nilfs_btree_init(bmap);
1730 if (nreq != NULL) { 1697 if (nreq != NULL) {
1731 nilfs_bmap_commit_alloc_ptr(bmap, dreq); 1698 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
1732 nilfs_bmap_commit_alloc_ptr(bmap, nreq); 1699 nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat);
1733 1700
1734 /* create child node at level 1 */ 1701 /* create child node at level 1 */
1735 lock_buffer(bh); 1702 lock_buffer(bh);
@@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1751 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 1718 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1752 2, 1, &keys[0], &tmpptr); 1719 2, 1, &keys[0], &tmpptr);
1753 } else { 1720 } else {
1754 nilfs_bmap_commit_alloc_ptr(bmap, dreq); 1721 nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat);
1755 1722
1756 /* create root node at level 1 */ 1723 /* create root node at level 1 */
1757 node = nilfs_btree_get_root(btree); 1724 node = nilfs_btree_get_root(btree);
@@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1822 1789
1823static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, 1790static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1824 struct nilfs_btree_path *path, 1791 struct nilfs_btree_path *path,
1825 int level) 1792 int level, struct inode *dat)
1826{ 1793{
1827 struct nilfs_btree_node *parent; 1794 struct nilfs_btree_node *parent;
1828 int ret; 1795 int ret;
@@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1832 nilfs_btree_node_get_ptr(btree, parent, 1799 nilfs_btree_node_get_ptr(btree, parent,
1833 path[level + 1].bp_index); 1800 path[level + 1].bp_index);
1834 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; 1801 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1835 ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, 1802 ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req,
1836 &path[level].bp_oldreq, 1803 &path[level].bp_newreq.bpr_req);
1837 &path[level].bp_newreq);
1838 if (ret < 0) 1804 if (ret < 0)
1839 return ret; 1805 return ret;
1840 1806
@@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1846 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1812 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1847 &path[level].bp_ctxt); 1813 &path[level].bp_ctxt);
1848 if (ret < 0) { 1814 if (ret < 0) {
1849 nilfs_bmap_abort_update_v(&btree->bt_bmap, 1815 nilfs_dat_abort_update(dat,
1850 &path[level].bp_oldreq, 1816 &path[level].bp_oldreq.bpr_req,
1851 &path[level].bp_newreq); 1817 &path[level].bp_newreq.bpr_req);
1852 return ret; 1818 return ret;
1853 } 1819 }
1854 } 1820 }
@@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1858 1824
1859static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, 1825static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1860 struct nilfs_btree_path *path, 1826 struct nilfs_btree_path *path,
1861 int level) 1827 int level, struct inode *dat)
1862{ 1828{
1863 struct nilfs_btree_node *parent; 1829 struct nilfs_btree_node *parent;
1864 1830
1865 nilfs_bmap_commit_update_v(&btree->bt_bmap, 1831 nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req,
1866 &path[level].bp_oldreq, 1832 &path[level].bp_newreq.bpr_req,
1867 &path[level].bp_newreq); 1833 btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS);
1868 1834
1869 if (buffer_nilfs_node(path[level].bp_bh)) { 1835 if (buffer_nilfs_node(path[level].bp_bh)) {
1870 nilfs_btnode_commit_change_key( 1836 nilfs_btnode_commit_change_key(
@@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1881 1847
1882static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 1848static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1883 struct nilfs_btree_path *path, 1849 struct nilfs_btree_path *path,
1884 int level) 1850 int level, struct inode *dat)
1885{ 1851{
1886 nilfs_bmap_abort_update_v(&btree->bt_bmap, 1852 nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req,
1887 &path[level].bp_oldreq, 1853 &path[level].bp_newreq.bpr_req);
1888 &path[level].bp_newreq);
1889 if (buffer_nilfs_node(path[level].bp_bh)) 1854 if (buffer_nilfs_node(path[level].bp_bh))
1890 nilfs_btnode_abort_change_key( 1855 nilfs_btnode_abort_change_key(
1891 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, 1856 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
@@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1894 1859
1895static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, 1860static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1896 struct nilfs_btree_path *path, 1861 struct nilfs_btree_path *path,
1897 int minlevel, 1862 int minlevel, int *maxlevelp,
1898 int *maxlevelp) 1863 struct inode *dat)
1899{ 1864{
1900 int level, ret; 1865 int level, ret;
1901 1866
1902 level = minlevel; 1867 level = minlevel;
1903 if (!buffer_nilfs_volatile(path[level].bp_bh)) { 1868 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1904 ret = nilfs_btree_prepare_update_v(btree, path, level); 1869 ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1905 if (ret < 0) 1870 if (ret < 0)
1906 return ret; 1871 return ret;
1907 } 1872 }
@@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1909 !buffer_dirty(path[level].bp_bh)) { 1874 !buffer_dirty(path[level].bp_bh)) {
1910 1875
1911 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); 1876 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1912 ret = nilfs_btree_prepare_update_v(btree, path, level); 1877 ret = nilfs_btree_prepare_update_v(btree, path, level, dat);
1913 if (ret < 0) 1878 if (ret < 0)
1914 goto out; 1879 goto out;
1915 } 1880 }
@@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1921 /* error */ 1886 /* error */
1922 out: 1887 out:
1923 while (--level > minlevel) 1888 while (--level > minlevel)
1924 nilfs_btree_abort_update_v(btree, path, level); 1889 nilfs_btree_abort_update_v(btree, path, level, dat);
1925 if (!buffer_nilfs_volatile(path[level].bp_bh)) 1890 if (!buffer_nilfs_volatile(path[level].bp_bh))
1926 nilfs_btree_abort_update_v(btree, path, level); 1891 nilfs_btree_abort_update_v(btree, path, level, dat);
1927 return ret; 1892 return ret;
1928} 1893}
1929 1894
1930static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, 1895static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1931 struct nilfs_btree_path *path, 1896 struct nilfs_btree_path *path,
1932 int minlevel, 1897 int minlevel, int maxlevel,
1933 int maxlevel, 1898 struct buffer_head *bh,
1934 struct buffer_head *bh) 1899 struct inode *dat)
1935{ 1900{
1936 int level; 1901 int level;
1937 1902
1938 if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) 1903 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1939 nilfs_btree_commit_update_v(btree, path, minlevel); 1904 nilfs_btree_commit_update_v(btree, path, minlevel, dat);
1940 1905
1941 for (level = minlevel + 1; level <= maxlevel; level++) 1906 for (level = minlevel + 1; level <= maxlevel; level++)
1942 nilfs_btree_commit_update_v(btree, path, level); 1907 nilfs_btree_commit_update_v(btree, path, level, dat);
1943} 1908}
1944 1909
1945static int nilfs_btree_propagate_v(struct nilfs_btree *btree, 1910static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1946 struct nilfs_btree_path *path, 1911 struct nilfs_btree_path *path,
1947 int level, 1912 int level, struct buffer_head *bh)
1948 struct buffer_head *bh)
1949{ 1913{
1950 int maxlevel, ret; 1914 int maxlevel, ret;
1951 struct nilfs_btree_node *parent; 1915 struct nilfs_btree_node *parent;
1916 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
1952 __u64 ptr; 1917 __u64 ptr;
1953 1918
1954 get_bh(bh); 1919 get_bh(bh);
1955 path[level].bp_bh = bh; 1920 path[level].bp_bh = bh;
1956 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); 1921 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel,
1922 dat);
1957 if (ret < 0) 1923 if (ret < 0)
1958 goto out; 1924 goto out;
1959 1925
@@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1961 parent = nilfs_btree_get_node(btree, path, level + 1); 1927 parent = nilfs_btree_get_node(btree, path, level + 1);
1962 ptr = nilfs_btree_node_get_ptr(btree, parent, 1928 ptr = nilfs_btree_node_get_ptr(btree, parent,
1963 path[level + 1].bp_index); 1929 path[level + 1].bp_index);
1964 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); 1930 ret = nilfs_dat_mark_dirty(dat, ptr);
1965 if (ret < 0) 1931 if (ret < 0)
1966 goto out; 1932 goto out;
1967 } 1933 }
1968 1934
1969 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); 1935 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat);
1970 1936
1971 out: 1937 out:
1972 brelse(path[level].bp_bh); 1938 brelse(path[level].bp_bh);
@@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1986 WARN_ON(!buffer_dirty(bh)); 1952 WARN_ON(!buffer_dirty(bh));
1987 1953
1988 btree = (struct nilfs_btree *)bmap; 1954 btree = (struct nilfs_btree *)bmap;
1989 path = nilfs_btree_alloc_path(btree); 1955 path = nilfs_btree_alloc_path();
1990 if (path == NULL) 1956 if (path == NULL)
1991 return -ENOMEM; 1957 return -ENOMEM;
1992 nilfs_btree_init_path(btree, path); 1958 nilfs_btree_init_path(path);
1993 1959
1994 if (buffer_nilfs_node(bh)) { 1960 if (buffer_nilfs_node(bh)) {
1995 node = (struct nilfs_btree_node *)bh->b_data; 1961 node = (struct nilfs_btree_node *)bh->b_data;
1996 key = nilfs_btree_node_get_key(btree, node, 0); 1962 key = nilfs_btree_node_get_key(node, 0);
1997 level = nilfs_btree_node_get_level(btree, node); 1963 level = nilfs_btree_node_get_level(node);
1998 } else { 1964 } else {
1999 key = nilfs_bmap_data_get_key(bmap, bh); 1965 key = nilfs_bmap_data_get_key(bmap, bh);
2000 level = NILFS_BTREE_LEVEL_DATA; 1966 level = NILFS_BTREE_LEVEL_DATA;
@@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
2013 nilfs_btree_propagate_p(btree, path, level, bh); 1979 nilfs_btree_propagate_p(btree, path, level, bh);
2014 1980
2015 out: 1981 out:
2016 nilfs_btree_clear_path(btree, path); 1982 nilfs_btree_release_path(path);
2017 nilfs_btree_free_path(btree, path); 1983 nilfs_btree_free_path(path);
2018 1984
2019 return ret; 1985 return ret;
2020} 1986}
@@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
2022static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, 1988static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
2023 struct buffer_head *bh) 1989 struct buffer_head *bh)
2024{ 1990{
2025 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); 1991 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr);
2026} 1992}
2027 1993
2028static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, 1994static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
@@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
2037 2003
2038 get_bh(bh); 2004 get_bh(bh);
2039 node = (struct nilfs_btree_node *)bh->b_data; 2005 node = (struct nilfs_btree_node *)bh->b_data;
2040 key = nilfs_btree_node_get_key(btree, node, 0); 2006 key = nilfs_btree_node_get_key(node, 0);
2041 level = nilfs_btree_node_get_level(btree, node); 2007 level = nilfs_btree_node_get_level(node);
2042 list_for_each(head, &lists[level]) { 2008 list_for_each(head, &lists[level]) {
2043 cbh = list_entry(head, struct buffer_head, b_assoc_buffers); 2009 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
2044 cnode = (struct nilfs_btree_node *)cbh->b_data; 2010 cnode = (struct nilfs_btree_node *)cbh->b_data;
2045 ckey = nilfs_btree_node_get_key(btree, cnode, 0); 2011 ckey = nilfs_btree_node_get_key(cnode, 0);
2046 if (key < ckey) 2012 if (key < ckey)
2047 break; 2013 break;
2048 } 2014 }
@@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2120 nilfs_btree_node_set_ptr(btree, parent, 2086 nilfs_btree_node_set_ptr(btree, parent,
2121 path[level + 1].bp_index, blocknr); 2087 path[level + 1].bp_index, blocknr);
2122 2088
2123 key = nilfs_btree_node_get_key(btree, parent, 2089 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2124 path[level + 1].bp_index);
2125 /* on-disk format */ 2090 /* on-disk format */
2126 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2091 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2127 binfo->bi_dat.bi_level = level; 2092 binfo->bi_dat.bi_level = level;
@@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2137 union nilfs_binfo *binfo) 2102 union nilfs_binfo *binfo)
2138{ 2103{
2139 struct nilfs_btree_node *parent; 2104 struct nilfs_btree_node *parent;
2105 struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap);
2140 __u64 key; 2106 __u64 key;
2141 __u64 ptr; 2107 __u64 ptr;
2142 union nilfs_bmap_ptr_req req; 2108 union nilfs_bmap_ptr_req req;
@@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2146 ptr = nilfs_btree_node_get_ptr(btree, parent, 2112 ptr = nilfs_btree_node_get_ptr(btree, parent,
2147 path[level + 1].bp_index); 2113 path[level + 1].bp_index);
2148 req.bpr_ptr = ptr; 2114 req.bpr_ptr = ptr;
2149 ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); 2115 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
2150 if (unlikely(ret < 0)) 2116 if (ret < 0)
2151 return ret; 2117 return ret;
2118 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
2152 2119
2153 key = nilfs_btree_node_get_key(btree, parent, 2120 key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index);
2154 path[level + 1].bp_index);
2155 /* on-disk format */ 2121 /* on-disk format */
2156 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 2122 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2157 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 2123 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
@@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2171 int level, ret; 2137 int level, ret;
2172 2138
2173 btree = (struct nilfs_btree *)bmap; 2139 btree = (struct nilfs_btree *)bmap;
2174 path = nilfs_btree_alloc_path(btree); 2140 path = nilfs_btree_alloc_path();
2175 if (path == NULL) 2141 if (path == NULL)
2176 return -ENOMEM; 2142 return -ENOMEM;
2177 nilfs_btree_init_path(btree, path); 2143 nilfs_btree_init_path(path);
2178 2144
2179 if (buffer_nilfs_node(*bh)) { 2145 if (buffer_nilfs_node(*bh)) {
2180 node = (struct nilfs_btree_node *)(*bh)->b_data; 2146 node = (struct nilfs_btree_node *)(*bh)->b_data;
2181 key = nilfs_btree_node_get_key(btree, node, 0); 2147 key = nilfs_btree_node_get_key(node, 0);
2182 level = nilfs_btree_node_get_level(btree, node); 2148 level = nilfs_btree_node_get_level(node);
2183 } else { 2149 } else {
2184 key = nilfs_bmap_data_get_key(bmap, *bh); 2150 key = nilfs_bmap_data_get_key(bmap, *bh);
2185 level = NILFS_BTREE_LEVEL_DATA; 2151 level = NILFS_BTREE_LEVEL_DATA;
@@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2196 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); 2162 nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo);
2197 2163
2198 out: 2164 out:
2199 nilfs_btree_clear_path(btree, path); 2165 nilfs_btree_release_path(path);
2200 nilfs_btree_free_path(btree, path); 2166 nilfs_btree_free_path(path);
2201 2167
2202 return ret; 2168 return ret;
2203} 2169}
@@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2207 sector_t blocknr, 2173 sector_t blocknr,
2208 union nilfs_binfo *binfo) 2174 union nilfs_binfo *binfo)
2209{ 2175{
2210 struct nilfs_btree *btree;
2211 struct nilfs_btree_node *node; 2176 struct nilfs_btree_node *node;
2212 __u64 key; 2177 __u64 key;
2213 int ret; 2178 int ret;
2214 2179
2215 btree = (struct nilfs_btree *)bmap; 2180 ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr,
2216 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); 2181 blocknr);
2217 if (ret < 0) 2182 if (ret < 0)
2218 return ret; 2183 return ret;
2219 2184
2220 if (buffer_nilfs_node(*bh)) { 2185 if (buffer_nilfs_node(*bh)) {
2221 node = (struct nilfs_btree_node *)(*bh)->b_data; 2186 node = (struct nilfs_btree_node *)(*bh)->b_data;
2222 key = nilfs_btree_node_get_key(btree, node, 0); 2187 key = nilfs_btree_node_get_key(node, 0);
2223 } else 2188 } else
2224 key = nilfs_bmap_data_get_key(bmap, *bh); 2189 key = nilfs_bmap_data_get_key(bmap, *bh);
2225 2190
@@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2239 int ret; 2204 int ret;
2240 2205
2241 btree = (struct nilfs_btree *)bmap; 2206 btree = (struct nilfs_btree *)bmap;
2242 path = nilfs_btree_alloc_path(btree); 2207 path = nilfs_btree_alloc_path();
2243 if (path == NULL) 2208 if (path == NULL)
2244 return -ENOMEM; 2209 return -ENOMEM;
2245 nilfs_btree_init_path(btree, path); 2210 nilfs_btree_init_path(path);
2246 2211
2247 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); 2212 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2248 if (ret < 0) { 2213 if (ret < 0) {
@@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2262 nilfs_bmap_set_dirty(&btree->bt_bmap); 2227 nilfs_bmap_set_dirty(&btree->bt_bmap);
2263 2228
2264 out: 2229 out:
2265 nilfs_btree_clear_path(btree, path); 2230 nilfs_btree_release_path(path);
2266 nilfs_btree_free_path(btree, path); 2231 nilfs_btree_free_path(path);
2267 return ret; 2232 return ret;
2268} 2233}
2269 2234
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index aec942cf79e3..1c6cfb59128d 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
815 void *kaddr; 815 void *kaddr;
816 int ret; 816 int ret;
817 817
818 if (cno == 0) 818 /* CP number is invalid if it's zero or larger than the
819 return -ENOENT; /* checkpoint number 0 is invalid */ 819 largest exist one.*/
820 if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
821 return -ENOENT;
820 down_read(&NILFS_MDT(cpfile)->mi_sem); 822 down_read(&NILFS_MDT(cpfile)->mi_sem);
821 823
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); 824 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
824 goto out; 826 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0); 827 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); 828 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp); 829 if (nilfs_checkpoint_invalid(cp))
830 ret = -ENOENT;
831 else
832 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0); 833 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh); 834 brelse(bh);
830 835
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 788a45950197..debea896e701 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -27,8 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29 29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32 30
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, 31int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **, 32 struct nilfs_checkpoint **,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8927ca27e6f7..1ff8e15bd36b 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
109 nilfs_palloc_commit_free_entry(dat, req); 109 nilfs_palloc_commit_free_entry(dat, req);
110} 110}
111 111
112void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
113{
114 nilfs_dat_abort_entry(dat, req);
115 nilfs_palloc_abort_free_entry(dat, req);
116}
117
118int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) 112int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
119{ 113{
120 int ret; 114 int ret;
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
140 nilfs_dat_commit_entry(dat, req); 134 nilfs_dat_commit_entry(dat, req);
141} 135}
142 136
143void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
144{
145 nilfs_dat_abort_entry(dat, req);
146}
147
148int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) 137int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
149{ 138{
150 struct nilfs_dat_entry *entry; 139 struct nilfs_dat_entry *entry;
@@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
222 nilfs_dat_abort_entry(dat, req); 211 nilfs_dat_abort_entry(dat, req);
223} 212}
224 213
214int nilfs_dat_prepare_update(struct inode *dat,
215 struct nilfs_palloc_req *oldreq,
216 struct nilfs_palloc_req *newreq)
217{
218 int ret;
219
220 ret = nilfs_dat_prepare_end(dat, oldreq);
221 if (!ret) {
222 ret = nilfs_dat_prepare_alloc(dat, newreq);
223 if (ret < 0)
224 nilfs_dat_abort_end(dat, oldreq);
225 }
226 return ret;
227}
228
229void nilfs_dat_commit_update(struct inode *dat,
230 struct nilfs_palloc_req *oldreq,
231 struct nilfs_palloc_req *newreq, int dead)
232{
233 nilfs_dat_commit_end(dat, oldreq, dead);
234 nilfs_dat_commit_alloc(dat, newreq);
235}
236
237void nilfs_dat_abort_update(struct inode *dat,
238 struct nilfs_palloc_req *oldreq,
239 struct nilfs_palloc_req *newreq)
240{
241 nilfs_dat_abort_end(dat, oldreq);
242 nilfs_dat_abort_alloc(dat, newreq);
243}
244
225/** 245/**
226 * nilfs_dat_mark_dirty - 246 * nilfs_dat_mark_dirty -
227 * @dat: DAT file inode 247 * @dat: DAT file inode
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index d328b81eead4..406070d3ff49 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -27,7 +27,6 @@
27#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
28#include <linux/fs.h> 28#include <linux/fs.h>
29 29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31 30
32struct nilfs_palloc_req; 31struct nilfs_palloc_req;
33 32
@@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); 38int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, 39void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t); 40 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); 41int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); 42void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); 43void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
44int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *,
45 struct nilfs_palloc_req *);
46void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *,
47 struct nilfs_palloc_req *, int);
48void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *,
49 struct nilfs_palloc_req *);
46 50
47int nilfs_dat_mark_dirty(struct inode *, __u64); 51int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t); 52int nilfs_dat_freev(struct inode *, __u64 *, size_t);
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 342d9765df8d..d369ac718277 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
125 direct->d_bmap.b_last_allocated_ptr = ptr; 125 direct->d_bmap.b_last_allocated_ptr = ptr;
126} 126}
127 127
128static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
129 __u64 key,
130 union nilfs_bmap_ptr_req *req,
131 struct nilfs_bmap_stats *stats)
132{
133 int ret;
134
135 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
136 req->bpr_ptr = nilfs_direct_find_target_v(direct, key);
137 ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req);
138 if (ret < 0)
139 return ret;
140
141 stats->bs_nblocks = 1;
142 return 0;
143}
144
145static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
146 union nilfs_bmap_ptr_req *req,
147 __u64 key, __u64 ptr)
148{
149 struct buffer_head *bh;
150
151 /* ptr must be a pointer to a buffer head. */
152 bh = (struct buffer_head *)((unsigned long)ptr);
153 set_buffer_nilfs_volatile(bh);
154
155 nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req);
156 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
157
158 if (!nilfs_bmap_dirty(&direct->d_bmap))
159 nilfs_bmap_set_dirty(&direct->d_bmap);
160
161 if (NILFS_BMAP_USE_VBN(&direct->d_bmap))
162 nilfs_direct_set_target_v(direct, key, req->bpr_ptr);
163}
164
165static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) 128static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
166{ 129{
167 struct nilfs_direct *direct; 130 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
168 union nilfs_bmap_ptr_req req; 131 union nilfs_bmap_ptr_req req;
169 struct nilfs_bmap_stats stats; 132 struct inode *dat = NULL;
133 struct buffer_head *bh;
170 int ret; 134 int ret;
171 135
172 direct = (struct nilfs_direct *)bmap;
173 if (key > NILFS_DIRECT_KEY_MAX) 136 if (key > NILFS_DIRECT_KEY_MAX)
174 return -ENOENT; 137 return -ENOENT;
175 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) 138 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
176 return -EEXIST; 139 return -EEXIST;
177 140
178 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); 141 if (NILFS_BMAP_USE_VBN(bmap)) {
179 if (ret < 0) 142 req.bpr_ptr = nilfs_direct_find_target_v(direct, key);
180 return ret; 143 dat = nilfs_bmap_get_dat(bmap);
181 nilfs_direct_commit_insert(direct, &req, key, ptr); 144 }
182 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); 145 ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat);
146 if (!ret) {
147 /* ptr must be a pointer to a buffer head. */
148 bh = (struct buffer_head *)((unsigned long)ptr);
149 set_buffer_nilfs_volatile(bh);
183 150
184 return 0; 151 nilfs_bmap_commit_alloc_ptr(bmap, &req, dat);
185} 152 nilfs_direct_set_ptr(direct, key, req.bpr_ptr);
186 153
187static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, 154 if (!nilfs_bmap_dirty(bmap))
188 union nilfs_bmap_ptr_req *req, 155 nilfs_bmap_set_dirty(bmap);
189 __u64 key,
190 struct nilfs_bmap_stats *stats)
191{
192 int ret;
193 156
194 req->bpr_ptr = nilfs_direct_get_ptr(direct, key); 157 if (NILFS_BMAP_USE_VBN(bmap))
195 ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); 158 nilfs_direct_set_target_v(direct, key, req.bpr_ptr);
196 if (!ret)
197 stats->bs_nblocks = 1;
198 return ret;
199}
200 159
201static void nilfs_direct_commit_delete(struct nilfs_direct *direct, 160 nilfs_bmap_add_blocks(bmap, 1);
202 union nilfs_bmap_ptr_req *req, 161 }
203 __u64 key) 162 return ret;
204{
205 nilfs_bmap_commit_end_ptr(&direct->d_bmap, req);
206 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
207} 163}
208 164
209static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) 165static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
210{ 166{
211 struct nilfs_direct *direct; 167 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
212 union nilfs_bmap_ptr_req req; 168 union nilfs_bmap_ptr_req req;
213 struct nilfs_bmap_stats stats; 169 struct inode *dat;
214 int ret; 170 int ret;
215 171
216 direct = (struct nilfs_direct *)bmap; 172 if (key > NILFS_DIRECT_KEY_MAX ||
217 if ((key > NILFS_DIRECT_KEY_MAX) ||
218 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) 173 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
219 return -ENOENT; 174 return -ENOENT;
220 175
221 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); 176 dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL;
222 if (ret < 0) 177 req.bpr_ptr = nilfs_direct_get_ptr(direct, key);
223 return ret;
224 nilfs_direct_commit_delete(direct, &req, key);
225 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
226 178
227 return 0; 179 ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat);
180 if (!ret) {
181 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
182 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
183 nilfs_bmap_sub_blocks(bmap, 1);
184 }
185 return ret;
228} 186}
229 187
230static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) 188static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
310 return 0; 268 return 0;
311} 269}
312 270
313static int nilfs_direct_propagate_v(struct nilfs_direct *direct, 271static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
314 struct buffer_head *bh) 272 struct buffer_head *bh)
315{ 273{
316 union nilfs_bmap_ptr_req oldreq, newreq; 274 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
275 struct nilfs_palloc_req oldreq, newreq;
276 struct inode *dat;
317 __u64 key; 277 __u64 key;
318 __u64 ptr; 278 __u64 ptr;
319 int ret; 279 int ret;
320 280
321 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); 281 if (!NILFS_BMAP_USE_VBN(bmap))
282 return 0;
283
284 dat = nilfs_bmap_get_dat(bmap);
285 key = nilfs_bmap_data_get_key(bmap, bh);
322 ptr = nilfs_direct_get_ptr(direct, key); 286 ptr = nilfs_direct_get_ptr(direct, key);
323 if (!buffer_nilfs_volatile(bh)) { 287 if (!buffer_nilfs_volatile(bh)) {
324 oldreq.bpr_ptr = ptr; 288 oldreq.pr_entry_nr = ptr;
325 newreq.bpr_ptr = ptr; 289 newreq.pr_entry_nr = ptr;
326 ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, 290 ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq);
327 &newreq);
328 if (ret < 0) 291 if (ret < 0)
329 return ret; 292 return ret;
330 nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); 293 nilfs_dat_commit_update(dat, &oldreq, &newreq,
294 bmap->b_ptr_type == NILFS_BMAP_PTR_VS);
331 set_buffer_nilfs_volatile(bh); 295 set_buffer_nilfs_volatile(bh);
332 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); 296 nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr);
333 } else 297 } else
334 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); 298 ret = nilfs_dat_mark_dirty(dat, ptr);
335 299
336 return ret; 300 return ret;
337} 301}
338 302
339static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
340 struct buffer_head *bh)
341{
342 struct nilfs_direct *direct = (struct nilfs_direct *)bmap;
343
344 return NILFS_BMAP_USE_VBN(bmap) ?
345 nilfs_direct_propagate_v(direct, bh) : 0;
346}
347
348static int nilfs_direct_assign_v(struct nilfs_direct *direct, 303static int nilfs_direct_assign_v(struct nilfs_direct *direct,
349 __u64 key, __u64 ptr, 304 __u64 key, __u64 ptr,
350 struct buffer_head **bh, 305 struct buffer_head **bh,
351 sector_t blocknr, 306 sector_t blocknr,
352 union nilfs_binfo *binfo) 307 union nilfs_binfo *binfo)
353{ 308{
309 struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap);
354 union nilfs_bmap_ptr_req req; 310 union nilfs_bmap_ptr_req req;
355 int ret; 311 int ret;
356 312
357 req.bpr_ptr = ptr; 313 req.bpr_ptr = ptr;
358 ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); 314 ret = nilfs_dat_prepare_start(dat, &req.bpr_req);
359 if (unlikely(ret < 0)) 315 if (!ret) {
360 return ret; 316 nilfs_dat_commit_start(dat, &req.bpr_req, blocknr);
361 317 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
362 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); 318 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
363 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); 319 }
364 320 return ret;
365 return 0;
366} 321}
367 322
368static int nilfs_direct_assign_p(struct nilfs_direct *direct, 323static int nilfs_direct_assign_p(struct nilfs_direct *direct,
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 5d30a35679b5..ecc3ba76db47 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -31,7 +31,6 @@
31#include "mdt.h" 31#include "mdt.h"
32#include "alloc.h" 32#include "alloc.h"
33 33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35 34
36static inline struct nilfs_inode * 35static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) 36nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index fe9d8f2a13f8..807e584b163d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
430 430
431 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); 431 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
432 432
433 if (nilfs_read_inode_common(inode, raw_inode)) 433 err = nilfs_read_inode_common(inode, raw_inode);
434 if (err)
434 goto failed_unmap; 435 goto failed_unmap;
435 436
436 if (S_ISREG(inode->i_mode)) { 437 if (S_ISREG(inode->i_mode)) {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 6ea5f872e2de..6572ea4bc4df 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
442 const char *msg; 442 const char *msg;
443 int ret; 443 int ret;
444 444
445 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
446 if (ret < 0) {
447 msg = "cannot read source blocks";
448 goto failed;
449 }
450
451 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); 445 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]);
452 if (ret < 0) { 446 if (ret < 0) {
453 /* 447 /*
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
548 } 542 }
549 } 543 }
550 544
551 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); 545 /*
546 * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(),
547 * which will operates an inode list without blocking.
548 * To protect the list from concurrent operations,
549 * nilfs_ioctl_move_blocks should be atomic operation.
550 */
551 if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) {
552 ret = -EBUSY;
553 goto out_free;
554 }
555
556 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
557 if (ret < 0)
558 printk(KERN_ERR "NILFS: GC failed during preparation: "
559 "cannot read source blocks: err=%d\n", ret);
560 else
561 ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
562
563 clear_nilfs_gc_running(nilfs);
552 564
553 out_free: 565 out_free:
554 while (--n >= 0) 566 while (--n >= 0)
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2dfd47714ae5..156bf6091a96 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
103 goto failed_unlock; 103 goto failed_unlock;
104 104
105 err = -EEXIST; 105 err = -EEXIST;
106 if (buffer_uptodate(bh) || buffer_mapped(bh)) 106 if (buffer_uptodate(bh))
107 goto failed_bh; 107 goto failed_bh;
108#if 0 108
109 /* The uptodate flag is not protected by the page lock, but
110 the mapped flag is. Thus, we don't have to wait the buffer. */
111 wait_on_buffer(bh); 109 wait_on_buffer(bh);
112 if (buffer_uptodate(bh)) 110 if (buffer_uptodate(bh))
113 goto failed_bh; 111 goto failed_bh;
114#endif
115 112
116 bh->b_bdev = nilfs->ns_bdev; 113 bh->b_bdev = nilfs->ns_bdev;
117 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); 114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
@@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
139 int mode, struct buffer_head **out_bh) 136 int mode, struct buffer_head **out_bh)
140{ 137{
141 struct buffer_head *bh; 138 struct buffer_head *bh;
142 unsigned long blknum = 0; 139 __u64 blknum = 0;
143 int ret = -ENOMEM; 140 int ret = -ENOMEM;
144 141
145 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); 142 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
162 unlock_buffer(bh); 159 unlock_buffer(bh);
163 goto out; 160 goto out;
164 } 161 }
165 if (!buffer_mapped(bh)) { /* unused buffer */ 162
166 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, 163 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum);
167 &blknum); 164 if (unlikely(ret)) {
168 if (unlikely(ret)) { 165 unlock_buffer(bh);
169 unlock_buffer(bh); 166 goto failed_bh;
170 goto failed_bh;
171 }
172 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
173 bh->b_blocknr = blknum;
174 set_buffer_mapped(bh);
175 } 167 }
168 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
169 bh->b_blocknr = (sector_t)blknum;
170 set_buffer_mapped(bh);
176 171
177 bh->b_end_io = end_buffer_read_sync; 172 bh->b_end_io = end_buffer_read_sync;
178 get_bh(bh); 173 get_bh(bh);
@@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
402 struct inode *inode = container_of(page->mapping, 397 struct inode *inode = container_of(page->mapping,
403 struct inode, i_data); 398 struct inode, i_data);
404 struct super_block *sb = inode->i_sb; 399 struct super_block *sb = inode->i_sb;
400 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
405 struct nilfs_sb_info *writer = NULL; 401 struct nilfs_sb_info *writer = NULL;
406 int err = 0; 402 int err = 0;
407 403
@@ -411,9 +407,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
411 if (page->mapping->assoc_mapping) 407 if (page->mapping->assoc_mapping)
412 return 0; /* Do not request flush for shadow page cache */ 408 return 0; /* Do not request flush for shadow page cache */
413 if (!sb) { 409 if (!sb) {
414 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); 410 down_read(&nilfs->ns_writer_sem);
411 writer = nilfs->ns_writer;
415 if (!writer) { 412 if (!writer) {
416 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); 413 up_read(&nilfs->ns_writer_sem);
417 return -EROFS; 414 return -EROFS;
418 } 415 }
419 sb = writer->s_super; 416 sb = writer->s_super;
@@ -425,7 +422,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
425 nilfs_flush_segment(sb, inode->i_ino); 422 nilfs_flush_segment(sb, inode->i_ino);
426 423
427 if (writer) 424 if (writer)
428 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); 425 up_read(&nilfs->ns_writer_sem);
429 return err; 426 return err;
430} 427}
431 428
@@ -516,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
516} 513}
517 514
518struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, 515struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
519 ino_t ino, gfp_t gfp_mask) 516 ino_t ino)
520{ 517{
521 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); 518 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino,
519 NILFS_MDT_GFP);
522 520
523 if (!inode) 521 if (!inode)
524 return NULL; 522 return NULL;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index df683e0bca6a..431599733c9b 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); 74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *); 75int nilfs_mdt_fetch_dirty(struct inode *);
76 76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, 77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t);
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, 78struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t); 79 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *); 80void nilfs_mdt_destroy(struct inode *);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d80cc71be749..6dc83591d118 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
552 printk(KERN_WARNING 552 printk(KERN_WARNING
553 "NILFS warning: error recovering data block " 553 "NILFS warning: error recovering data block "
554 "(err=%d, ino=%lu, block-offset=%llu)\n", 554 "(err=%d, ino=%lu, block-offset=%llu)\n",
555 err, rb->ino, (unsigned long long)rb->blkoff); 555 err, (unsigned long)rb->ino,
556 (unsigned long long)rb->blkoff);
556 if (!err2) 557 if (!err2)
557 err2 = err; 558 err2 = err;
558 next: 559 next:
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 9e3fe17bb96b..e6d9e37fa241 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316{ 316{
317 struct bio *bio; 317 struct bio *bio;
318 318
319 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 319 bio = bio_alloc(GFP_NOIO, nr_vecs);
320 if (bio == NULL) { 320 if (bio == NULL) {
321 while (!bio && (nr_vecs >>= 1)) 321 while (!bio && (nr_vecs >>= 1))
322 bio = bio_alloc(GFP_NOWAIT, nr_vecs); 322 bio = bio_alloc(GFP_NOIO, nr_vecs);
323 } 323 }
324 if (likely(bio)) { 324 if (likely(bio)) {
325 bio->bi_bdev = sb->s_bdev; 325 bio->bi_bdev = sb->s_bdev;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 51ff3d0a4ee2..683df89dbae5 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2501,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2501 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && 2501 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2502 nilfs_discontinued(nilfs)) { 2502 nilfs_discontinued(nilfs)) {
2503 down_write(&nilfs->ns_sem); 2503 down_write(&nilfs->ns_sem);
2504 req->sb_err = nilfs_commit_super(sbi, 0); 2504 req->sb_err = nilfs_commit_super(sbi,
2505 nilfs_altsb_need_update(nilfs));
2505 up_write(&nilfs->ns_sem); 2506 up_write(&nilfs->ns_sem);
2506 } 2507 }
2507 } 2508 }
@@ -2689,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg)
2689 } else { 2690 } else {
2690 DEFINE_WAIT(wait); 2691 DEFINE_WAIT(wait);
2691 int should_sleep = 1; 2692 int should_sleep = 1;
2693 struct the_nilfs *nilfs;
2692 2694
2693 prepare_to_wait(&sci->sc_wait_daemon, &wait, 2695 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2694 TASK_INTERRUPTIBLE); 2696 TASK_INTERRUPTIBLE);
@@ -2709,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg)
2709 finish_wait(&sci->sc_wait_daemon, &wait); 2711 finish_wait(&sci->sc_wait_daemon, &wait);
2710 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && 2712 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2711 time_after_eq(jiffies, sci->sc_timer->expires)); 2713 time_after_eq(jiffies, sci->sc_timer->expires));
2714 nilfs = sci->sc_sbi->s_nilfs;
2715 if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
2716 set_nilfs_discontinued(nilfs);
2712 } 2717 }
2713 goto loop; 2718 goto loop;
2714 2719
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index a2c4d76c3366..0e99e5c0bd0f 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -28,7 +28,6 @@
28#include <linux/nilfs2_fs.h> 28#include <linux/nilfs2_fs.h>
29#include "mdt.h" 29#include "mdt.h"
30 30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32 31
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) 32static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{ 33{
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 151964f0de4c..55f3d6b60732 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -50,6 +50,8 @@
50#include <linux/writeback.h> 50#include <linux/writeback.h>
51#include <linux/kobject.h> 51#include <linux/kobject.h>
52#include <linux/exportfs.h> 52#include <linux/exportfs.h>
53#include <linux/seq_file.h>
54#include <linux/mount.h>
53#include "nilfs.h" 55#include "nilfs.h"
54#include "mdt.h" 56#include "mdt.h"
55#include "alloc.h" 57#include "alloc.h"
@@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)"); 67 "(NILFS)");
66MODULE_LICENSE("GPL"); 68MODULE_LICENSE("GPL");
67 69
68static void nilfs_write_super(struct super_block *sb);
69static int nilfs_remount(struct super_block *sb, int *flags, char *data); 70static int nilfs_remount(struct super_block *sb, int *flags, char *data);
70 71
71/** 72/**
@@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb)
311 312
312 lock_kernel(); 313 lock_kernel();
313 314
314 if (sb->s_dirt)
315 nilfs_write_super(sb);
316
317 nilfs_detach_segment_constructor(sbi); 315 nilfs_detach_segment_constructor(sbi);
318 316
319 if (!(sb->s_flags & MS_RDONLY)) { 317 if (!(sb->s_flags & MS_RDONLY)) {
@@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb)
336 unlock_kernel(); 334 unlock_kernel();
337} 335}
338 336
339/** 337static int nilfs_sync_fs(struct super_block *sb, int wait)
340 * nilfs_write_super - write super block(s) of NILFS
341 * @sb: super_block
342 *
343 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
344 * clears s_dirt. This function is called in the section protected by
345 * lock_super().
346 *
347 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
348 * of the struct the_nilfs. Lock order must be as follows:
349 *
350 * 1. lock_super()
351 * 2. down_write(&nilfs->ns_sem)
352 *
353 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
354 * of the super block (nilfs->ns_sbp[]).
355 *
356 * In most cases, VFS functions call lock_super() before calling these
357 * methods. So we must be careful not to bring on deadlocks when using
358 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
359 *
360 * Note that order of lock_kernel() and lock_super() depends on contexts
361 * of VFS. We should also note that lock_kernel() can be used in its
362 * protective section and only the outermost one has an effect.
363 */
364static void nilfs_write_super(struct super_block *sb)
365{ 338{
366 struct nilfs_sb_info *sbi = NILFS_SB(sb); 339 struct nilfs_sb_info *sbi = NILFS_SB(sb);
367 struct the_nilfs *nilfs = sbi->s_nilfs; 340 struct the_nilfs *nilfs = sbi->s_nilfs;
368
369 down_write(&nilfs->ns_sem);
370 if (!(sb->s_flags & MS_RDONLY)) {
371 struct nilfs_super_block **sbp = nilfs->ns_sbp;
372 u64 t = get_seconds();
373 int dupsb;
374
375 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
376 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
377 up_write(&nilfs->ns_sem);
378 return;
379 }
380 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
381 nilfs_commit_super(sbi, dupsb);
382 }
383 sb->s_dirt = 0;
384 up_write(&nilfs->ns_sem);
385}
386
387static int nilfs_sync_fs(struct super_block *sb, int wait)
388{
389 int err = 0; 341 int err = 0;
390 342
391 nilfs_write_super(sb);
392
393 /* This function is called when super block should be written back */ 343 /* This function is called when super block should be written back */
394 if (wait) 344 if (wait)
395 err = nilfs_construct_segment(sb); 345 err = nilfs_construct_segment(sb);
346
347 down_write(&nilfs->ns_sem);
348 if (sb->s_dirt)
349 nilfs_commit_super(sbi, 1);
350 up_write(&nilfs->ns_sem);
351
396 return err; 352 return err;
397} 353}
398 354
@@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
407 list_add(&sbi->s_list, &nilfs->ns_supers); 363 list_add(&sbi->s_list, &nilfs->ns_supers);
408 up_write(&nilfs->ns_super_sem); 364 up_write(&nilfs->ns_super_sem);
409 365
410 sbi->s_ifile = nilfs_mdt_new( 366 sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO);
411 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
412 if (!sbi->s_ifile) 367 if (!sbi->s_ifile)
413 return -ENOMEM; 368 return -ENOMEM;
414 369
@@ -529,6 +484,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
529 return 0; 484 return 0;
530} 485}
531 486
487static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
488{
489 struct super_block *sb = vfs->mnt_sb;
490 struct nilfs_sb_info *sbi = NILFS_SB(sb);
491
492 if (!nilfs_test_opt(sbi, BARRIER))
493 seq_printf(seq, ",barrier=off");
494 if (nilfs_test_opt(sbi, SNAPSHOT))
495 seq_printf(seq, ",cp=%llu",
496 (unsigned long long int)sbi->s_snapshot_cno);
497 if (nilfs_test_opt(sbi, ERRORS_RO))
498 seq_printf(seq, ",errors=remount-ro");
499 if (nilfs_test_opt(sbi, ERRORS_PANIC))
500 seq_printf(seq, ",errors=panic");
501 if (nilfs_test_opt(sbi, STRICT_ORDER))
502 seq_printf(seq, ",order=strict");
503
504 return 0;
505}
506
532static struct super_operations nilfs_sops = { 507static struct super_operations nilfs_sops = {
533 .alloc_inode = nilfs_alloc_inode, 508 .alloc_inode = nilfs_alloc_inode,
534 .destroy_inode = nilfs_destroy_inode, 509 .destroy_inode = nilfs_destroy_inode,
@@ -538,7 +513,7 @@ static struct super_operations nilfs_sops = {
538 /* .drop_inode = nilfs_drop_inode, */ 513 /* .drop_inode = nilfs_drop_inode, */
539 .delete_inode = nilfs_delete_inode, 514 .delete_inode = nilfs_delete_inode,
540 .put_super = nilfs_put_super, 515 .put_super = nilfs_put_super,
541 .write_super = nilfs_write_super, 516 /* .write_super = nilfs_write_super, */
542 .sync_fs = nilfs_sync_fs, 517 .sync_fs = nilfs_sync_fs,
543 /* .write_super_lockfs */ 518 /* .write_super_lockfs */
544 /* .unlockfs */ 519 /* .unlockfs */
@@ -546,7 +521,7 @@ static struct super_operations nilfs_sops = {
546 .remount_fs = nilfs_remount, 521 .remount_fs = nilfs_remount,
547 .clear_inode = nilfs_clear_inode, 522 .clear_inode = nilfs_clear_inode,
548 /* .umount_begin */ 523 /* .umount_begin */
549 /* .show_options */ 524 .show_options = nilfs_show_options
550}; 525};
551 526
552static struct inode * 527static struct inode *
@@ -816,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
816 791
817 if (sb->s_flags & MS_RDONLY) { 792 if (sb->s_flags & MS_RDONLY) {
818 if (nilfs_test_opt(sbi, SNAPSHOT)) { 793 if (nilfs_test_opt(sbi, SNAPSHOT)) {
794 down_read(&nilfs->ns_segctor_sem);
819 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, 795 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
820 sbi->s_snapshot_cno); 796 sbi->s_snapshot_cno);
821 if (err < 0) 797 up_read(&nilfs->ns_segctor_sem);
798 if (err < 0) {
799 if (err == -ENOENT)
800 err = -EINVAL;
822 goto failed_sbi; 801 goto failed_sbi;
802 }
823 if (!err) { 803 if (!err) {
824 printk(KERN_ERR 804 printk(KERN_ERR
825 "NILFS: The specified checkpoint is " 805 "NILFS: The specified checkpoint is "
@@ -1127,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
1127 */ 1107 */
1128 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); 1108 sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
1129 1109
1130 if (!sd.cno)
1131 /* trying to get the latest checkpoint. */
1132 sd.cno = nilfs_last_cno(nilfs);
1133
1134 /* 1110 /*
1135 * Get super block instance holding the nilfs_sb_info struct. 1111 * Get super block instance holding the nilfs_sb_info struct.
1136 * A new instance is allocated if no existing mount is present or 1112 * A new instance is allocated if no existing mount is present or
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8b8889825716..ad391a8c3e7e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
68 68
69 nilfs->ns_bdev = bdev; 69 nilfs->ns_bdev = bdev;
70 atomic_set(&nilfs->ns_count, 1); 70 atomic_set(&nilfs->ns_count, 1);
71 atomic_set(&nilfs->ns_writer_refcount, -1);
72 atomic_set(&nilfs->ns_ndirtyblks, 0); 71 atomic_set(&nilfs->ns_ndirtyblks, 0);
73 init_rwsem(&nilfs->ns_sem); 72 init_rwsem(&nilfs->ns_sem);
74 init_rwsem(&nilfs->ns_super_sem); 73 init_rwsem(&nilfs->ns_super_sem);
75 mutex_init(&nilfs->ns_mount_mutex); 74 mutex_init(&nilfs->ns_mount_mutex);
76 mutex_init(&nilfs->ns_writer_mutex); 75 init_rwsem(&nilfs->ns_writer_sem);
77 INIT_LIST_HEAD(&nilfs->ns_list); 76 INIT_LIST_HEAD(&nilfs->ns_list);
78 INIT_LIST_HEAD(&nilfs->ns_supers); 77 INIT_LIST_HEAD(&nilfs->ns_supers);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 78 spin_lock_init(&nilfs->ns_last_segment_lock);
@@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs,
188 inode_size = nilfs->ns_inode_size; 187 inode_size = nilfs->ns_inode_size;
189 188
190 err = -ENOMEM; 189 err = -ENOMEM;
191 nilfs->ns_dat = nilfs_mdt_new( 190 nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
192 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
193 if (unlikely(!nilfs->ns_dat)) 191 if (unlikely(!nilfs->ns_dat))
194 goto failed; 192 goto failed;
195 193
196 nilfs->ns_gc_dat = nilfs_mdt_new( 194 nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO);
197 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
198 if (unlikely(!nilfs->ns_gc_dat)) 195 if (unlikely(!nilfs->ns_gc_dat))
199 goto failed_dat; 196 goto failed_dat;
200 197
201 nilfs->ns_cpfile = nilfs_mdt_new( 198 nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO);
202 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
203 if (unlikely(!nilfs->ns_cpfile)) 199 if (unlikely(!nilfs->ns_cpfile))
204 goto failed_gc_dat; 200 goto failed_gc_dat;
205 201
206 nilfs->ns_sufile = nilfs_mdt_new( 202 nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO);
207 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
208 if (unlikely(!nilfs->ns_sufile)) 203 if (unlikely(!nilfs->ns_sufile))
209 goto failed_cpfile; 204 goto failed_cpfile;
210 205
@@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
596 591
597 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); 592 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
598 593
599 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; 594 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
600 if (!bdi)
601 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
602 nilfs->ns_bdi = bdi ? : &default_backing_dev_info; 595 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
603 596
604 /* Finding last segment */ 597 /* Finding last segment */
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 1b9caafb8662..20abd55881e0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -37,6 +37,7 @@ enum {
37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and 37 THE_NILFS_LOADED, /* Roll-back/roll-forward has done and
38 the latest checkpoint was loaded */ 38 the latest checkpoint was loaded */
39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ 39 THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
40 THE_NILFS_GC_RUNNING, /* gc process is running */
40}; 41};
41 42
42/** 43/**
@@ -50,8 +51,7 @@ enum {
50 * @ns_sem: semaphore for shared states 51 * @ns_sem: semaphore for shared states
51 * @ns_super_sem: semaphore for global operations across super block instances 52 * @ns_super_sem: semaphore for global operations across super block instances
52 * @ns_mount_mutex: mutex protecting mount process of nilfs 53 * @ns_mount_mutex: mutex protecting mount process of nilfs
53 * @ns_writer_mutex: mutex protecting ns_writer attach/detach 54 * @ns_writer_sem: semaphore protecting ns_writer attach/detach
54 * @ns_writer_refcount: number of referrers on ns_writer
55 * @ns_current: back pointer to current mount 55 * @ns_current: back pointer to current mount
56 * @ns_sbh: buffer heads of on-disk super blocks 56 * @ns_sbh: buffer heads of on-disk super blocks
57 * @ns_sbp: pointers to super block data 57 * @ns_sbp: pointers to super block data
@@ -100,8 +100,7 @@ struct the_nilfs {
100 struct rw_semaphore ns_sem; 100 struct rw_semaphore ns_sem;
101 struct rw_semaphore ns_super_sem; 101 struct rw_semaphore ns_super_sem;
102 struct mutex ns_mount_mutex; 102 struct mutex ns_mount_mutex;
103 struct mutex ns_writer_mutex; 103 struct rw_semaphore ns_writer_sem;
104 atomic_t ns_writer_refcount;
105 104
106 /* 105 /*
107 * components protected by ns_super_sem 106 * components protected by ns_super_sem
@@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \
197THE_NILFS_FNS(INIT, init) 196THE_NILFS_FNS(INIT, init)
198THE_NILFS_FNS(LOADED, loaded) 197THE_NILFS_FNS(LOADED, loaded)
199THE_NILFS_FNS(DISCONTINUED, discontinued) 198THE_NILFS_FNS(DISCONTINUED, discontinued)
199THE_NILFS_FNS(GC_RUNNING, gc_running)
200 200
201/* Minimum interval of periodical update of superblocks (in seconds) */ 201/* Minimum interval of periodical update of superblocks (in seconds) */
202#define NILFS_SB_FREQ 10 202#define NILFS_SB_FREQ 10
203#define NILFS_ALTSB_FREQ 60 /* spare superblock */ 203#define NILFS_ALTSB_FREQ 60 /* spare superblock */
204 204
205static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
206{
207 u64 t = get_seconds();
208 return t < nilfs->ns_sbwtime[0] ||
209 t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ;
210}
211
212static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs)
213{
214 u64 t = get_seconds();
215 struct nilfs_super_block **sbp = nilfs->ns_sbp;
216 return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
217}
218
205void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 219void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
206struct the_nilfs *find_or_create_nilfs(struct block_device *); 220struct the_nilfs *find_or_create_nilfs(struct block_device *);
207void put_nilfs(struct the_nilfs *); 221void put_nilfs(struct the_nilfs *);
@@ -221,34 +235,21 @@ static inline void get_nilfs(struct the_nilfs *nilfs)
221 atomic_inc(&nilfs->ns_count); 235 atomic_inc(&nilfs->ns_count);
222} 236}
223 237
224static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
225{
226 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
227 mutex_lock(&nilfs->ns_writer_mutex);
228 return nilfs->ns_writer;
229}
230
231static inline void nilfs_put_writer(struct the_nilfs *nilfs)
232{
233 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
234 mutex_unlock(&nilfs->ns_writer_mutex);
235}
236
237static inline void 238static inline void
238nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 239nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
239{ 240{
240 mutex_lock(&nilfs->ns_writer_mutex); 241 down_write(&nilfs->ns_writer_sem);
241 nilfs->ns_writer = sbi; 242 nilfs->ns_writer = sbi;
242 mutex_unlock(&nilfs->ns_writer_mutex); 243 up_write(&nilfs->ns_writer_sem);
243} 244}
244 245
245static inline void 246static inline void
246nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 247nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
247{ 248{
248 mutex_lock(&nilfs->ns_writer_mutex); 249 down_write(&nilfs->ns_writer_sem);
249 if (sbi == nilfs->ns_writer) 250 if (sbi == nilfs->ns_writer)
250 nilfs->ns_writer = NULL; 251 nilfs->ns_writer = NULL;
251 mutex_unlock(&nilfs->ns_writer_mutex); 252 up_write(&nilfs->ns_writer_sem);
252} 253}
253 254
254static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) 255static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3140a4429af1..4350d4993b18 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2076,14 +2076,6 @@ err_out:
2076 *ppos = pos; 2076 *ppos = pos;
2077 if (cached_page) 2077 if (cached_page)
2078 page_cache_release(cached_page); 2078 page_cache_release(cached_page);
2079 /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */
2080 if (likely(!status)) {
2081 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) {
2082 if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb))
2083 status = generic_osync_inode(vi, mapping,
2084 OSYNC_METADATA|OSYNC_DATA);
2085 }
2086 }
2087 pagevec_lru_add_file(&lru_pvec); 2079 pagevec_lru_add_file(&lru_pvec);
2088 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2080 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2089 written ? "written" : "status", (unsigned long)written, 2081 written ? "written" : "status", (unsigned long)written,
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2145 mutex_lock(&inode->i_mutex); 2137 mutex_lock(&inode->i_mutex);
2146 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); 2138 ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
2147 mutex_unlock(&inode->i_mutex); 2139 mutex_unlock(&inode->i_mutex);
2148 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2140 if (ret > 0) {
2149 int err = sync_page_range(inode, mapping, pos, ret); 2141 int err = generic_write_sync(file, pos, ret);
2150 if (err < 0) 2142 if (err < 0)
2151 ret = err; 2143 ret = err;
2152 } 2144 }
@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov,
2173 if (ret == -EIOCBQUEUED) 2165 if (ret == -EIOCBQUEUED)
2174 ret = wait_on_sync_kiocb(&kiocb); 2166 ret = wait_on_sync_kiocb(&kiocb);
2175 mutex_unlock(&inode->i_mutex); 2167 mutex_unlock(&inode->i_mutex);
2176 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2168 if (ret > 0) {
2177 int err = sync_page_range(inode, mapping, *ppos - ret, ret); 2169 int err = generic_write_sync(file, *ppos - ret, ret);
2178 if (err < 0) 2170 if (err < 0)
2179 ret = err; 2171 ret = err;
2180 } 2172 }
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 23bf68453d7d..1caa0ef0b2bb 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -384,13 +384,12 @@ unm_err_out:
384 * it is dirty in the inode meta data rather than the data page cache of the 384 * it is dirty in the inode meta data rather than the data page cache of the
385 * inode, and thus there are no data pages that need writing out. Therefore, a 385 * inode, and thus there are no data pages that need writing out. Therefore, a
386 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the 386 * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
387 * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to 387 * other hand, is not sufficient, because ->write_inode needs to be called even
388 * ensure ->write_inode is called from generic_osync_inode() and this needs to 388 * in case of fdatasync. This needs to happen or the file data would not
389 * happen or the file data would not necessarily hit the device synchronously, 389 * necessarily hit the device synchronously, even though the vfs inode has the
390 * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC 390 * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
391 * simply "feels" better than just I_DIRTY_SYNC, since the file data has not 391 * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
392 * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own 392 * which is not what I_DIRTY_SYNC on its own would suggest.
393 * would suggest.
394 */ 393 */
395void __mark_mft_record_dirty(ntfs_inode *ni) 394void __mark_mft_record_dirty(ntfs_inode *ni)
396{ 395{
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b401654011a2..8a1e61545f41 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1747,8 +1747,8 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
1747 * we know zeros will only be needed in the first and/or last cluster. 1747 * we know zeros will only be needed in the first and/or last cluster.
1748 */ 1748 */
1749 if (clusters_to_alloc || extents_to_split || 1749 if (clusters_to_alloc || extents_to_split ||
1750 wc->w_desc[0].c_needs_zero || 1750 (wc->w_clen && (wc->w_desc[0].c_needs_zero ||
1751 wc->w_desc[wc->w_clen - 1].c_needs_zero) 1751 wc->w_desc[wc->w_clen - 1].c_needs_zero)))
1752 cluster_of_pages = 1; 1752 cluster_of_pages = 1;
1753 else 1753 else
1754 cluster_of_pages = 0; 1754 cluster_of_pages = 0;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 2f28b7de2c8d..b4957c7d9fe2 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
85 goto bail; 85 goto bail;
86 } 86 }
87 87
88 /*
89 * If the last lookup failed to create dentry lock, let us
90 * redo it.
91 */
92 if (!dentry->d_fsdata) {
93 mlog(0, "Inode %llu doesn't have dentry lock, "
94 "returning false\n",
95 (unsigned long long)OCFS2_I(inode)->ip_blkno);
96 goto bail;
97 }
98
88 ret = 1; 99 ret = 1;
89 100
90bail: 101bail:
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 1c9efb406a96..02bf17808bdc 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -325,6 +325,7 @@ clear_fields:
325} 325}
326 326
327static struct backing_dev_info dlmfs_backing_dev_info = { 327static struct backing_dev_info dlmfs_backing_dev_info = {
328 .name = "ocfs2-dlmfs",
328 .ra_pages = 0, /* No readahead */ 329 .ra_pages = 0, /* No readahead */
329 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 330 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
330}; 331};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index aa501d3f93f1..221c5e98957b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1871,8 +1871,7 @@ relock:
1871 goto out_dio; 1871 goto out_dio;
1872 } 1872 }
1873 } else { 1873 } else {
1874 written = generic_file_aio_write_nolock(iocb, iov, nr_segs, 1874 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos);
1875 *ppos);
1876 } 1875 }
1877 1876
1878out_dio: 1877out_dio:
@@ -1880,18 +1879,21 @@ out_dio:
1880 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); 1879 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1881 1880
1882 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { 1881 if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
1883 /* 1882 ret = filemap_fdatawrite_range(file->f_mapping, pos,
1884 * The generic write paths have handled getting data 1883 pos + count - 1);
1885 * to disk, but since we don't make use of the dirty 1884 if (ret < 0)
1886 * inode list, a manual journal commit is necessary 1885 written = ret;
1887 * here. 1886
1888 */ 1887 if (!ret && (old_size != i_size_read(inode) ||
1889 if (old_size != i_size_read(inode) || 1888 old_clusters != OCFS2_I(inode)->ip_clusters)) {
1890 old_clusters != OCFS2_I(inode)->ip_clusters) {
1891 ret = jbd2_journal_force_commit(osb->journal->j_journal); 1889 ret = jbd2_journal_force_commit(osb->journal->j_journal);
1892 if (ret < 0) 1890 if (ret < 0)
1893 written = ret; 1891 written = ret;
1894 } 1892 }
1893
1894 if (!ret)
1895 ret = filemap_fdatawait_range(file->f_mapping, pos,
1896 pos + count - 1);
1895 } 1897 }
1896 1898
1897 /* 1899 /*
@@ -1991,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1991 1993
1992 if (ret > 0) { 1994 if (ret > 0) {
1993 unsigned long nr_pages; 1995 unsigned long nr_pages;
1996 int err;
1994 1997
1995 *ppos += ret;
1996 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1998 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1997 1999
1998 /* 2000 err = generic_write_sync(out, *ppos, ret);
1999 * If file or inode is SYNC and we actually wrote some data, 2001 if (err)
2000 * sync it. 2002 ret = err;
2001 */ 2003 else
2002 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 2004 *ppos += ret;
2003 int err;
2004
2005 mutex_lock(&inode->i_mutex);
2006 err = ocfs2_rw_lock(inode, 1);
2007 if (err < 0) {
2008 mlog_errno(err);
2009 } else {
2010 err = generic_osync_inode(inode, mapping,
2011 OSYNC_METADATA|OSYNC_DATA);
2012 ocfs2_rw_unlock(inode, 1);
2013 }
2014 mutex_unlock(&inode->i_mutex);
2015 2005
2016 if (err)
2017 ret = err;
2018 }
2019 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2006 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2020 } 2007 }
2021 2008
diff --git a/fs/open.c b/fs/open.c
index dd98e8076024..31191bf513e4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -199,7 +199,7 @@ out:
199int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, 199int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
200 struct file *filp) 200 struct file *filp)
201{ 201{
202 int err; 202 int ret;
203 struct iattr newattrs; 203 struct iattr newattrs;
204 204
205 /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ 205 /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
214 } 214 }
215 215
216 /* Remove suid/sgid on truncate too */ 216 /* Remove suid/sgid on truncate too */
217 newattrs.ia_valid |= should_remove_suid(dentry); 217 ret = should_remove_suid(dentry);
218 if (ret)
219 newattrs.ia_valid |= ret | ATTR_FORCE;
218 220
219 mutex_lock(&dentry->d_inode->i_mutex); 221 mutex_lock(&dentry->d_inode->i_mutex);
220 err = notify_change(dentry, &newattrs); 222 ret = notify_change(dentry, &newattrs);
221 mutex_unlock(&dentry->d_inode->i_mutex); 223 mutex_unlock(&dentry->d_inode->i_mutex);
222 return err; 224 return ret;
223} 225}
224 226
225static long do_sys_truncate(const char __user *pathname, loff_t length) 227static long do_sys_truncate(const char __user *pathname, loff_t length)
@@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
957 int error; 959 int error;
958 struct file *f; 960 struct file *f;
959 961
962 validate_creds(cred);
963
960 /* 964 /*
961 * We must always pass in a valid mount pointer. Historically 965 * We must always pass in a valid mount pointer. Historically
962 * callers got away with not passing it, but we must enforce this at 966 * callers got away with not passing it, but we must enforce this at
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index ea4e6cb29e13..fbeaddf595d3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev,
248 part_stat_read(p, merges[WRITE]), 248 part_stat_read(p, merges[WRITE]),
249 (unsigned long long)part_stat_read(p, sectors[WRITE]), 249 (unsigned long long)part_stat_read(p, sectors[WRITE]),
250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), 250 jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
251 p->in_flight, 251 part_in_flight(p),
252 jiffies_to_msecs(part_stat_read(p, io_ticks)), 252 jiffies_to_msecs(part_stat_read(p, io_ticks)),
253 jiffies_to_msecs(part_stat_read(p, time_in_queue))); 253 jiffies_to_msecs(part_stat_read(p, time_in_queue)));
254} 254}
255 255
256ssize_t part_inflight_show(struct device *dev,
257 struct device_attribute *attr, char *buf)
258{
259 struct hd_struct *p = dev_to_part(dev);
260
261 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]);
262}
263
256#ifdef CONFIG_FAIL_MAKE_REQUEST 264#ifdef CONFIG_FAIL_MAKE_REQUEST
257ssize_t part_fail_show(struct device *dev, 265ssize_t part_fail_show(struct device *dev,
258 struct device_attribute *attr, char *buf) 266 struct device_attribute *attr, char *buf)
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
281static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); 289static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
282static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); 290static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
283static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); 291static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
292static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);
284#ifdef CONFIG_FAIL_MAKE_REQUEST 293#ifdef CONFIG_FAIL_MAKE_REQUEST
285static struct device_attribute dev_attr_fail = 294static struct device_attribute dev_attr_fail =
286 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); 295 __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store);
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = {
292 &dev_attr_size.attr, 301 &dev_attr_size.attr,
293 &dev_attr_alignment_offset.attr, 302 &dev_attr_alignment_offset.attr,
294 &dev_attr_stat.attr, 303 &dev_attr_stat.attr,
304 &dev_attr_inflight.attr,
295#ifdef CONFIG_FAIL_MAKE_REQUEST 305#ifdef CONFIG_FAIL_MAKE_REQUEST
296 &dev_attr_fail.attr, 306 &dev_attr_fail.attr,
297#endif 307#endif
@@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = {
302 .attrs = part_attrs, 312 .attrs = part_attrs,
303}; 313};
304 314
305static struct attribute_group *part_attr_groups[] = { 315static const struct attribute_group *part_attr_groups[] = {
306 &part_attr_group, 316 &part_attr_group,
307#ifdef CONFIG_BLK_DEV_IO_TRACE 317#ifdef CONFIG_BLK_DEV_IO_TRACE
308 &blk_trace_attr_group, 318 &blk_trace_attr_group,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 0ff7566c767c..a7f0110fca4c 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops;
46static const struct inode_operations ramfs_dir_inode_operations; 46static const struct inode_operations ramfs_dir_inode_operations;
47 47
48static struct backing_dev_info ramfs_backing_dev_info = { 48static struct backing_dev_info ramfs_backing_dev_info = {
49 .name = "ramfs",
49 .ra_pages = 0, /* No readahead */ 50 .ra_pages = 0, /* No readahead */
50 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | 51 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
51 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | 52 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
diff --git a/fs/splice.c b/fs/splice.c
index 73766d24f97b..7394e9e17534 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
502 len = left; 502 len = left;
503 503
504 ret = __generic_file_splice_read(in, ppos, pipe, len, flags); 504 ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
505 if (ret > 0) 505 if (ret > 0) {
506 *ppos += ret; 506 *ppos += ret;
507 file_accessed(in);
508 }
507 509
508 return ret; 510 return ret;
509} 511}
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
963 965
964 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); 966 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
965 ret = file_remove_suid(out); 967 ret = file_remove_suid(out);
966 if (!ret) 968 if (!ret) {
969 file_update_time(out);
967 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); 970 ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
971 }
968 mutex_unlock(&inode->i_mutex); 972 mutex_unlock(&inode->i_mutex);
969 } while (ret > 0); 973 } while (ret > 0);
970 splice_from_pipe_end(pipe, &sd); 974 splice_from_pipe_end(pipe, &sd);
@@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
976 980
977 if (ret > 0) { 981 if (ret > 0) {
978 unsigned long nr_pages; 982 unsigned long nr_pages;
983 int err;
979 984
980 *ppos += ret;
981 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 985 nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
982 986
983 /* 987 err = generic_write_sync(out, *ppos, ret);
984 * If file or inode is SYNC and we actually wrote some data, 988 if (err)
985 * sync it. 989 ret = err;
986 */ 990 else
987 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { 991 *ppos += ret;
988 int err;
989
990 mutex_lock(&inode->i_mutex);
991 err = generic_osync_inode(inode, mapping,
992 OSYNC_METADATA|OSYNC_DATA);
993 mutex_unlock(&inode->i_mutex);
994
995 if (err)
996 ret = err;
997 }
998 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 992 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
999 } 993 }
1000 994
diff --git a/fs/super.c b/fs/super.c
index 2761d3e22ed9..b03fea8fbfb6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
62 s = NULL; 62 s = NULL;
63 goto out; 63 goto out;
64 } 64 }
65 INIT_LIST_HEAD(&s->s_dirty);
66 INIT_LIST_HEAD(&s->s_io);
67 INIT_LIST_HEAD(&s->s_more_io);
68 INIT_LIST_HEAD(&s->s_files); 65 INIT_LIST_HEAD(&s->s_files);
69 INIT_LIST_HEAD(&s->s_instances); 66 INIT_LIST_HEAD(&s->s_instances);
70 INIT_HLIST_HEAD(&s->s_anon); 67 INIT_HLIST_HEAD(&s->s_anon);
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb)
171 * Drops a temporary reference, frees superblock if there's no 168 * Drops a temporary reference, frees superblock if there's no
172 * references left. 169 * references left.
173 */ 170 */
174static void put_super(struct super_block *sb) 171void put_super(struct super_block *sb)
175{ 172{
176 spin_lock(&sb_lock); 173 spin_lock(&sb_lock);
177 __put_super(sb); 174 __put_super(sb);
@@ -710,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data)
710{ 707{
711 s->s_bdev = data; 708 s->s_bdev = data;
712 s->s_dev = s->s_bdev->bd_dev; 709 s->s_dev = s->s_bdev->bd_dev;
710
711 /*
712 * We set the bdi here to the queue backing, file systems can
713 * overwrite this in ->fill_super()
714 */
715 s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
713 return 0; 716 return 0;
714} 717}
715 718
diff --git a/fs/sync.c b/fs/sync.c
index 3422ba61d86d..c08467a5d7cb 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -19,20 +19,29 @@
19 SYNC_FILE_RANGE_WAIT_AFTER) 19 SYNC_FILE_RANGE_WAIT_AFTER)
20 20
21/* 21/*
22 * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) 22 * Do the filesystem syncing work. For simple filesystems
23 * just dirties buffers with inodes so we have to submit IO for these buffers 23 * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
24 * via __sync_blockdev(). This also speeds up the wait == 1 case since in that 24 * submit IO for these buffers via __sync_blockdev(). This also speeds up the
25 * case write_inode() functions do sync_dirty_buffer() and thus effectively 25 * wait == 1 case since in that case write_inode() functions do
26 * write one block at a time. 26 * sync_dirty_buffer() and thus effectively write one block at a time.
27 */ 27 */
28static int __sync_filesystem(struct super_block *sb, int wait) 28static int __sync_filesystem(struct super_block *sb, int wait)
29{ 29{
30 /*
31 * This should be safe, as we require bdi backing to actually
32 * write out data in the first place
33 */
34 if (!sb->s_bdi)
35 return 0;
36
30 /* Avoid doing twice syncing and cache pruning for quota sync */ 37 /* Avoid doing twice syncing and cache pruning for quota sync */
31 if (!wait) 38 if (!wait) {
32 writeout_quota_sb(sb, -1); 39 writeout_quota_sb(sb, -1);
33 else 40 writeback_inodes_sb(sb);
41 } else {
34 sync_quota_sb(sb, -1); 42 sync_quota_sb(sb, -1);
35 sync_inodes_sb(sb, wait); 43 sync_inodes_sb(sb);
44 }
36 if (sb->s_op->sync_fs) 45 if (sb->s_op->sync_fs)
37 sb->s_op->sync_fs(sb, wait); 46 sb->s_op->sync_fs(sb, wait);
38 return __sync_blockdev(sb->s_bdev, wait); 47 return __sync_blockdev(sb->s_bdev, wait);
@@ -99,7 +108,7 @@ restart:
99 spin_unlock(&sb_lock); 108 spin_unlock(&sb_lock);
100 109
101 down_read(&sb->s_umount); 110 down_read(&sb->s_umount);
102 if (!(sb->s_flags & MS_RDONLY) && sb->s_root) 111 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
103 __sync_filesystem(sb, wait); 112 __sync_filesystem(sb, wait);
104 up_read(&sb->s_umount); 113 up_read(&sb->s_umount);
105 114
@@ -118,7 +127,7 @@ restart:
118 */ 127 */
119SYSCALL_DEFINE0(sync) 128SYSCALL_DEFINE0(sync)
120{ 129{
121 wakeup_pdflush(0); 130 wakeup_flusher_threads(0);
122 sync_filesystems(0); 131 sync_filesystems(0);
123 sync_filesystems(1); 132 sync_filesystems(1);
124 if (unlikely(laptop_mode)) 133 if (unlikely(laptop_mode))
@@ -176,19 +185,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
176} 185}
177 186
178/** 187/**
179 * vfs_fsync - perform a fsync or fdatasync on a file 188 * vfs_fsync_range - helper to sync a range of data & metadata to disk
180 * @file: file to sync 189 * @file: file to sync
181 * @dentry: dentry of @file 190 * @dentry: dentry of @file
182 * @data: only perform a fdatasync operation 191 * @start: offset in bytes of the beginning of data range to sync
192 * @end: offset in bytes of the end of data range (inclusive)
193 * @datasync: perform only datasync
183 * 194 *
184 * Write back data and metadata for @file to disk. If @datasync is 195 * Write back data in range @start..@end and metadata for @file to disk. If
185 * set only metadata needed to access modified file data is written. 196 * @datasync is set only metadata needed to access modified file data is
197 * written.
186 * 198 *
187 * In case this function is called from nfsd @file may be %NULL and 199 * In case this function is called from nfsd @file may be %NULL and
188 * only @dentry is set. This can only happen when the filesystem 200 * only @dentry is set. This can only happen when the filesystem
189 * implements the export_operations API. 201 * implements the export_operations API.
190 */ 202 */
191int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 203int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
204 loff_t end, int datasync)
192{ 205{
193 const struct file_operations *fop; 206 const struct file_operations *fop;
194 struct address_space *mapping; 207 struct address_space *mapping;
@@ -212,7 +225,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
212 goto out; 225 goto out;
213 } 226 }
214 227
215 ret = filemap_fdatawrite(mapping); 228 ret = filemap_write_and_wait_range(mapping, start, end);
216 229
217 /* 230 /*
218 * We need to protect against concurrent writers, which could cause 231 * We need to protect against concurrent writers, which could cause
@@ -223,12 +236,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
223 if (!ret) 236 if (!ret)
224 ret = err; 237 ret = err;
225 mutex_unlock(&mapping->host->i_mutex); 238 mutex_unlock(&mapping->host->i_mutex);
226 err = filemap_fdatawait(mapping); 239
227 if (!ret)
228 ret = err;
229out: 240out:
230 return ret; 241 return ret;
231} 242}
243EXPORT_SYMBOL(vfs_fsync_range);
244
245/**
246 * vfs_fsync - perform a fsync or fdatasync on a file
247 * @file: file to sync
248 * @dentry: dentry of @file
249 * @datasync: only perform a fdatasync operation
250 *
251 * Write back data and metadata for @file to disk. If @datasync is
252 * set only metadata needed to access modified file data is written.
253 *
254 * In case this function is called from nfsd @file may be %NULL and
255 * only @dentry is set. This can only happen when the filesystem
256 * implements the export_operations API.
257 */
258int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
259{
260 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync);
261}
232EXPORT_SYMBOL(vfs_fsync); 262EXPORT_SYMBOL(vfs_fsync);
233 263
234static int do_fsync(unsigned int fd, int datasync) 264static int do_fsync(unsigned int fd, int datasync)
@@ -254,6 +284,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
254 return do_fsync(fd, 1); 284 return do_fsync(fd, 1);
255} 285}
256 286
287/**
288 * generic_write_sync - perform syncing after a write if file / inode is sync
289 * @file: file to which the write happened
290 * @pos: offset where the write started
291 * @count: length of the write
292 *
293 * This is just a simple wrapper about our general syncing function.
294 */
295int generic_write_sync(struct file *file, loff_t pos, loff_t count)
296{
297 if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
298 return 0;
299 return vfs_fsync_range(file, file->f_path.dentry, pos,
300 pos + count - 1, 1);
301}
302EXPORT_SYMBOL(generic_write_sync);
303
257/* 304/*
258 * sys_sync_file_range() permits finely controlled syncing over a segment of 305 * sys_sync_file_range() permits finely controlled syncing over a segment of
259 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is 306 * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 14f2d71ea3ce..0050fc40e8c9 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
760const struct inode_operations sysfs_dir_inode_operations = { 760const struct inode_operations sysfs_dir_inode_operations = {
761 .lookup = sysfs_lookup, 761 .lookup = sysfs_lookup,
762 .setattr = sysfs_setattr, 762 .setattr = sysfs_setattr,
763 .setxattr = sysfs_setxattr,
763}; 764};
764 765
765static void remove_dir(struct sysfs_dirent *sd) 766static void remove_dir(struct sysfs_dirent *sd)
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 555f0ff988df..e28cecf179f5 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -18,6 +18,8 @@
18#include <linux/capability.h> 18#include <linux/capability.h>
19#include <linux/errno.h> 19#include <linux/errno.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/xattr.h>
22#include <linux/security.h>
21#include "sysfs.h" 23#include "sysfs.h"
22 24
23extern struct super_block * sysfs_sb; 25extern struct super_block * sysfs_sb;
@@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = {
29}; 31};
30 32
31static struct backing_dev_info sysfs_backing_dev_info = { 33static struct backing_dev_info sysfs_backing_dev_info = {
34 .name = "sysfs",
32 .ra_pages = 0, /* No readahead */ 35 .ra_pages = 0, /* No readahead */
33 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
34}; 37};
35 38
36static const struct inode_operations sysfs_inode_operations ={ 39static const struct inode_operations sysfs_inode_operations ={
37 .setattr = sysfs_setattr, 40 .setattr = sysfs_setattr,
41 .setxattr = sysfs_setxattr,
38}; 42};
39 43
40int __init sysfs_inode_init(void) 44int __init sysfs_inode_init(void)
@@ -42,18 +46,37 @@ int __init sysfs_inode_init(void)
42 return bdi_init(&sysfs_backing_dev_info); 46 return bdi_init(&sysfs_backing_dev_info);
43} 47}
44 48
49struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
50{
51 struct sysfs_inode_attrs *attrs;
52 struct iattr *iattrs;
53
54 attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
55 if (!attrs)
56 return NULL;
57 iattrs = &attrs->ia_iattr;
58
59 /* assign default attributes */
60 iattrs->ia_mode = sd->s_mode;
61 iattrs->ia_uid = 0;
62 iattrs->ia_gid = 0;
63 iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
64
65 return attrs;
66}
45int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) 67int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
46{ 68{
47 struct inode * inode = dentry->d_inode; 69 struct inode * inode = dentry->d_inode;
48 struct sysfs_dirent * sd = dentry->d_fsdata; 70 struct sysfs_dirent * sd = dentry->d_fsdata;
49 struct iattr * sd_iattr; 71 struct sysfs_inode_attrs *sd_attrs;
72 struct iattr *iattrs;
50 unsigned int ia_valid = iattr->ia_valid; 73 unsigned int ia_valid = iattr->ia_valid;
51 int error; 74 int error;
52 75
53 if (!sd) 76 if (!sd)
54 return -EINVAL; 77 return -EINVAL;
55 78
56 sd_iattr = sd->s_iattr; 79 sd_attrs = sd->s_iattr;
57 80
58 error = inode_change_ok(inode, iattr); 81 error = inode_change_ok(inode, iattr);
59 if (error) 82 if (error)
@@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
65 if (error) 88 if (error)
66 return error; 89 return error;
67 90
68 if (!sd_iattr) { 91 if (!sd_attrs) {
69 /* setting attributes for the first time, allocate now */ 92 /* setting attributes for the first time, allocate now */
70 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 93 sd_attrs = sysfs_init_inode_attrs(sd);
71 if (!sd_iattr) 94 if (!sd_attrs)
72 return -ENOMEM; 95 return -ENOMEM;
73 /* assign default attributes */ 96 sd->s_iattr = sd_attrs;
74 sd_iattr->ia_mode = sd->s_mode; 97 } else {
75 sd_iattr->ia_uid = 0; 98 /* attributes were changed at least once in past */
76 sd_iattr->ia_gid = 0; 99 iattrs = &sd_attrs->ia_iattr;
77 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 100
78 sd->s_iattr = sd_iattr; 101 if (ia_valid & ATTR_UID)
102 iattrs->ia_uid = iattr->ia_uid;
103 if (ia_valid & ATTR_GID)
104 iattrs->ia_gid = iattr->ia_gid;
105 if (ia_valid & ATTR_ATIME)
106 iattrs->ia_atime = timespec_trunc(iattr->ia_atime,
107 inode->i_sb->s_time_gran);
108 if (ia_valid & ATTR_MTIME)
109 iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime,
110 inode->i_sb->s_time_gran);
111 if (ia_valid & ATTR_CTIME)
112 iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime,
113 inode->i_sb->s_time_gran);
114 if (ia_valid & ATTR_MODE) {
115 umode_t mode = iattr->ia_mode;
116
117 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
118 mode &= ~S_ISGID;
119 iattrs->ia_mode = sd->s_mode = mode;
120 }
79 } 121 }
122 return error;
123}
80 124
81 /* attributes were changed atleast once in past */ 125int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
82 126 size_t size, int flags)
83 if (ia_valid & ATTR_UID) 127{
84 sd_iattr->ia_uid = iattr->ia_uid; 128 struct sysfs_dirent *sd = dentry->d_fsdata;
85 if (ia_valid & ATTR_GID) 129 struct sysfs_inode_attrs *iattrs;
86 sd_iattr->ia_gid = iattr->ia_gid; 130 void *secdata;
87 if (ia_valid & ATTR_ATIME) 131 int error;
88 sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, 132 u32 secdata_len = 0;
89 inode->i_sb->s_time_gran); 133
90 if (ia_valid & ATTR_MTIME) 134 if (!sd)
91 sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, 135 return -EINVAL;
92 inode->i_sb->s_time_gran); 136 if (!sd->s_iattr)
93 if (ia_valid & ATTR_CTIME) 137 sd->s_iattr = sysfs_init_inode_attrs(sd);
94 sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, 138 if (!sd->s_iattr)
95 inode->i_sb->s_time_gran); 139 return -ENOMEM;
96 if (ia_valid & ATTR_MODE) { 140
97 umode_t mode = iattr->ia_mode; 141 iattrs = sd->s_iattr;
98 142
99 if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) 143 if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
100 mode &= ~S_ISGID; 144 const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
101 sd_iattr->ia_mode = sd->s_mode = mode; 145 error = security_inode_setsecurity(dentry->d_inode, suffix,
102 } 146 value, size, flags);
147 if (error)
148 goto out;
149 error = security_inode_getsecctx(dentry->d_inode,
150 &secdata, &secdata_len);
151 if (error)
152 goto out;
153 if (iattrs->ia_secdata)
154 security_release_secctx(iattrs->ia_secdata,
155 iattrs->ia_secdata_len);
156 iattrs->ia_secdata = secdata;
157 iattrs->ia_secdata_len = secdata_len;
103 158
159 } else
160 return -EINVAL;
161out:
104 return error; 162 return error;
105} 163}
106 164
@@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd)
146static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) 204static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
147{ 205{
148 struct bin_attribute *bin_attr; 206 struct bin_attribute *bin_attr;
207 struct sysfs_inode_attrs *iattrs;
149 208
150 inode->i_private = sysfs_get(sd); 209 inode->i_private = sysfs_get(sd);
151 inode->i_mapping->a_ops = &sysfs_aops; 210 inode->i_mapping->a_ops = &sysfs_aops;
@@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
154 inode->i_ino = sd->s_ino; 213 inode->i_ino = sd->s_ino;
155 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); 214 lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key);
156 215
157 if (sd->s_iattr) { 216 iattrs = sd->s_iattr;
217 if (iattrs) {
158 /* sysfs_dirent has non-default attributes 218 /* sysfs_dirent has non-default attributes
159 * get them for the new inode from persistent copy 219 * get them for the new inode from persistent copy
160 * in sysfs_dirent 220 * in sysfs_dirent
161 */ 221 */
162 set_inode_attr(inode, sd->s_iattr); 222 set_inode_attr(inode, &iattrs->ia_iattr);
223 if (iattrs->ia_secdata)
224 security_inode_notifysecctx(inode,
225 iattrs->ia_secdata,
226 iattrs->ia_secdata_len);
163 } else 227 } else
164 set_default_inode_attr(inode, sd->s_mode); 228 set_default_inode_attr(inode, sd->s_mode);
165 229
166
167 /* initialize inode according to type */ 230 /* initialize inode according to type */
168 switch (sysfs_type(sd)) { 231 switch (sysfs_type(sd)) {
169 case SYSFS_DIR: 232 case SYSFS_DIR:
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 1d897ad808e0..c5081ad77026 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -16,6 +16,7 @@
16#include <linux/kobject.h> 16#include <linux/kobject.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/security.h>
19 20
20#include "sysfs.h" 21#include "sysfs.h"
21 22
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co
209} 210}
210 211
211const struct inode_operations sysfs_symlink_inode_operations = { 212const struct inode_operations sysfs_symlink_inode_operations = {
213 .setxattr = sysfs_setxattr,
212 .readlink = generic_readlink, 214 .readlink = generic_readlink,
213 .follow_link = sysfs_follow_link, 215 .follow_link = sysfs_follow_link,
214 .put_link = sysfs_put_link, 216 .put_link = sysfs_put_link,
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3fa0d98481e2..af4c4e7482ac 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,8 @@
8 * This file is released under the GPLv2. 8 * This file is released under the GPLv2.
9 */ 9 */
10 10
11#include <linux/fs.h>
12
11struct sysfs_open_dirent; 13struct sysfs_open_dirent;
12 14
13/* type-specific structures for sysfs_dirent->s_* union members */ 15/* type-specific structures for sysfs_dirent->s_* union members */
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr {
31 struct hlist_head buffers; 33 struct hlist_head buffers;
32}; 34};
33 35
36struct sysfs_inode_attrs {
37 struct iattr ia_iattr;
38 void *ia_secdata;
39 u32 ia_secdata_len;
40};
41
34/* 42/*
35 * sysfs_dirent - the building block of sysfs hierarchy. Each and 43 * sysfs_dirent - the building block of sysfs hierarchy. Each and
36 * every sysfs node is represented by single sysfs_dirent. 44 * every sysfs node is represented by single sysfs_dirent.
@@ -56,7 +64,7 @@ struct sysfs_dirent {
56 unsigned int s_flags; 64 unsigned int s_flags;
57 ino_t s_ino; 65 ino_t s_ino;
58 umode_t s_mode; 66 umode_t s_mode;
59 struct iattr *s_iattr; 67 struct sysfs_inode_attrs *s_iattr;
60}; 68};
61 69
62#define SD_DEACTIVATED_BIAS INT_MIN 70#define SD_DEACTIVATED_BIAS INT_MIN
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
148struct inode *sysfs_get_inode(struct sysfs_dirent *sd); 156struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
149void sysfs_delete_inode(struct inode *inode); 157void sysfs_delete_inode(struct inode *inode);
150int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); 158int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
159int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
160 size_t size, int flags);
151int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); 161int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
152int sysfs_inode_init(void); 162int sysfs_inode_init(void);
153 163
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index eaf6d891d46f..ee1ce68fd98b 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -54,41 +54,15 @@
54 * @nr_to_write: how many dirty pages to write-back 54 * @nr_to_write: how many dirty pages to write-back
55 * 55 *
56 * This function shrinks UBIFS liability by means of writing back some amount 56 * This function shrinks UBIFS liability by means of writing back some amount
57 * of dirty inodes and their pages. Returns the amount of pages which were 57 * of dirty inodes and their pages.
58 * written back. The returned value does not include dirty inodes which were
59 * synchronized.
60 * 58 *
61 * Note, this function synchronizes even VFS inodes which are locked 59 * Note, this function synchronizes even VFS inodes which are locked
62 * (@i_mutex) by the caller of the budgeting function, because write-back does 60 * (@i_mutex) by the caller of the budgeting function, because write-back does
63 * not touch @i_mutex. 61 * not touch @i_mutex.
64 */ 62 */
65static int shrink_liability(struct ubifs_info *c, int nr_to_write) 63static void shrink_liability(struct ubifs_info *c, int nr_to_write)
66{ 64{
67 int nr_written; 65 writeback_inodes_sb(c->vfs_sb);
68 struct writeback_control wbc = {
69 .sync_mode = WB_SYNC_NONE,
70 .range_end = LLONG_MAX,
71 .nr_to_write = nr_to_write,
72 };
73
74 generic_sync_sb_inodes(c->vfs_sb, &wbc);
75 nr_written = nr_to_write - wbc.nr_to_write;
76
77 if (!nr_written) {
78 /*
79 * Re-try again but wait on pages/inodes which are being
80 * written-back concurrently (e.g., by pdflush).
81 */
82 memset(&wbc, 0, sizeof(struct writeback_control));
83 wbc.sync_mode = WB_SYNC_ALL;
84 wbc.range_end = LLONG_MAX;
85 wbc.nr_to_write = nr_to_write;
86 generic_sync_sb_inodes(c->vfs_sb, &wbc);
87 nr_written = nr_to_write - wbc.nr_to_write;
88 }
89
90 dbg_budg("%d pages were written back", nr_written);
91 return nr_written;
92} 66}
93 67
94/** 68/**
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 26d2e0d80465..c4af069df1ad 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
438{ 438{
439 int i, err; 439 int i, err;
440 struct ubifs_info *c = sb->s_fs_info; 440 struct ubifs_info *c = sb->s_fs_info;
441 struct writeback_control wbc = {
442 .sync_mode = WB_SYNC_ALL,
443 .range_start = 0,
444 .range_end = LLONG_MAX,
445 .nr_to_write = LONG_MAX,
446 };
447 441
448 /* 442 /*
449 * Zero @wait is just an advisory thing to help the file system shove 443 * Zero @wait is just an advisory thing to help the file system shove
@@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
462 * the user be able to get more accurate results of 'statfs()' after 456 * the user be able to get more accurate results of 'statfs()' after
463 * they synchronize the file system. 457 * they synchronize the file system.
464 */ 458 */
465 generic_sync_sb_inodes(sb, &wbc); 459 sync_inodes_sb(sb);
466 460
467 /* 461 /*
468 * Synchronize write buffers, because 'ubifs_run_commit()' does not 462 * Synchronize write buffers, because 'ubifs_run_commit()' does not
@@ -1971,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1971 * 1965 *
1972 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 1966 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
1973 */ 1967 */
1968 c->bdi.name = "ubifs",
1974 c->bdi.capabilities = BDI_CAP_MAP_COPY; 1969 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1975 c->bdi.unplug_io_fn = default_unplug_io_fn; 1970 c->bdi.unplug_io_fn = default_unplug_io_fn;
1976 err = bdi_init(&c->bdi); 1971 err = bdi_init(&c->bdi);
@@ -1985,6 +1980,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1985 if (err) 1980 if (err)
1986 goto out_bdi; 1981 goto out_bdi;
1987 1982
1983 sb->s_bdi = &c->bdi;
1988 sb->s_fs_info = c; 1984 sb->s_fs_info = c;
1989 sb->s_magic = UBIFS_SUPER_MAGIC; 1985 sb->s_magic = UBIFS_SUPER_MAGIC;
1990 sb->s_blocksize = UBIFS_BLOCK_SIZE; 1986 sb->s_blocksize = UBIFS_BLOCK_SIZE;
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 1d2c570704c8..2ffdb6733af1 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -18,59 +18,6 @@
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20 20
21#if 0
22static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad,
23 uint8_t ad_size, struct kernel_lb_addr fe_loc,
24 int *pos, int *offset, struct buffer_head **bh,
25 int *error)
26{
27 int loffset = *offset;
28 int block;
29 uint8_t *ad;
30 int remainder;
31
32 *error = 0;
33
34 ad = (uint8_t *)(*bh)->b_data + *offset;
35 *offset += ad_size;
36
37 if (!ad) {
38 brelse(*bh);
39 *error = 1;
40 return NULL;
41 }
42
43 if (*offset == dir->i_sb->s_blocksize) {
44 brelse(*bh);
45 block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
46 if (!block)
47 return NULL;
48 *bh = udf_tread(dir->i_sb, block);
49 if (!*bh)
50 return NULL;
51 } else if (*offset > dir->i_sb->s_blocksize) {
52 ad = tmpad;
53
54 remainder = dir->i_sb->s_blocksize - loffset;
55 memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder);
56
57 brelse(*bh);
58 block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos);
59 if (!block)
60 return NULL;
61 (*bh) = udf_tread(dir->i_sb, block);
62 if (!*bh)
63 return NULL;
64
65 memcpy((uint8_t *)ad + remainder, (*bh)->b_data,
66 ad_size - remainder);
67 *offset = ad_size - remainder;
68 }
69
70 return ad;
71}
72#endif
73
74struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, 21struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos,
75 struct udf_fileident_bh *fibh, 22 struct udf_fileident_bh *fibh,
76 struct fileIdentDesc *cfi, 23 struct fileIdentDesc *cfi,
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset)
248 return fi; 195 return fi;
249} 196}
250 197
251#if 0
252static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset)
253{
254 struct extent_ad *ext;
255 struct fileEntry *fe;
256 uint8_t *ptr;
257
258 if ((!buffer) || (!offset)) {
259 printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n");
260 return NULL;
261 }
262
263 fe = (struct fileEntry *)buffer;
264
265 if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) {
266 udf_debug("0x%x != TAG_IDENT_FE\n",
267 le16_to_cpu(fe->descTag.tagIdent));
268 return NULL;
269 }
270
271 ptr = (uint8_t *)(fe->extendedAttr) +
272 le32_to_cpu(fe->lengthExtendedAttr);
273
274 if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs)))
275 ptr += *offset;
276
277 ext = (struct extent_ad *)ptr;
278
279 *offset = *offset + sizeof(struct extent_ad);
280 return ext;
281}
282#endif
283
284struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, 198struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset,
285 int inc) 199 int inc)
286{ 200{
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7464305382b5..b80cbd78833c 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
193static int udf_release_file(struct inode *inode, struct file *filp) 193static int udf_release_file(struct inode *inode, struct file *filp)
194{ 194{
195 if (filp->f_mode & FMODE_WRITE) { 195 if (filp->f_mode & FMODE_WRITE) {
196 mutex_lock(&inode->i_mutex);
196 lock_kernel(); 197 lock_kernel();
197 udf_discard_prealloc(inode); 198 udf_discard_prealloc(inode);
198 unlock_kernel(); 199 unlock_kernel();
200 mutex_unlock(&inode->i_mutex);
199 } 201 }
200 return 0; 202 return 0;
201} 203}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e7533f785636..6d24c2c63f93 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -90,19 +90,16 @@ no_delete:
90} 90}
91 91
92/* 92/*
93 * If we are going to release inode from memory, we discard preallocation and 93 * If we are going to release inode from memory, we truncate last inode extent
94 * truncate last inode extent to proper length. We could use drop_inode() but 94 * to proper length. We could use drop_inode() but it's called under inode_lock
95 * it's called under inode_lock and thus we cannot mark inode dirty there. We 95 * and thus we cannot mark inode dirty there. We use clear_inode() but we have
96 * use clear_inode() but we have to make sure to write inode as it's not written 96 * to make sure to write inode as it's not written automatically.
97 * automatically.
98 */ 97 */
99void udf_clear_inode(struct inode *inode) 98void udf_clear_inode(struct inode *inode)
100{ 99{
101 struct udf_inode_info *iinfo; 100 struct udf_inode_info *iinfo;
102 if (!(inode->i_sb->s_flags & MS_RDONLY)) { 101 if (!(inode->i_sb->s_flags & MS_RDONLY)) {
103 lock_kernel(); 102 lock_kernel();
104 /* Discard preallocation for directories, symlinks, etc. */
105 udf_discard_prealloc(inode);
106 udf_truncate_tail_extent(inode); 103 udf_truncate_tail_extent(inode);
107 unlock_kernel(); 104 unlock_kernel();
108 write_inode_now(inode, 0); 105 write_inode_now(inode, 0);
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
664 udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); 661 udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
665 662
666#ifdef UDF_PREALLOCATE 663#ifdef UDF_PREALLOCATE
667 /* preallocate blocks */ 664 /* We preallocate blocks only for regular files. It also makes sense
668 udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); 665 * for directories but there's a problem when to drop the
666 * preallocation. We might use some delayed work for that but I feel
667 * it's overengineering for a filesystem like UDF. */
668 if (S_ISREG(inode->i_mode))
669 udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
669#endif 670#endif
670 671
671 /* merge any continuous blocks in laarr */ 672 /* merge any continuous blocks in laarr */
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 1b88fd5df05d..43e24a3b8e10 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb)
36 ms_info.addr_format = CDROM_LBA; 36 ms_info.addr_format = CDROM_LBA;
37 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); 37 i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info);
38 38
39#define WE_OBEY_THE_WRITTEN_STANDARDS 1
40
41 if (i == 0) { 39 if (i == 0) {
42 udf_debug("XA disk: %s, vol_desc_start=%d\n", 40 udf_debug("XA disk: %s, vol_desc_start=%d\n",
43 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); 41 (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba);
44#if WE_OBEY_THE_WRITTEN_STANDARDS
45 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ 42 if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */
46#endif
47 vol_desc_start = ms_info.addr.lba; 43 vol_desc_start = ms_info.addr.lba;
48 } else { 44 } else {
49 udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); 45 udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 6a29fa34c478..21dad8c608f9 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
943 pc->componentType = 1; 943 pc->componentType = 1;
944 pc->lengthComponentIdent = 0; 944 pc->lengthComponentIdent = 0;
945 pc->componentFileVersionNum = 0; 945 pc->componentFileVersionNum = 0;
946 pc += sizeof(struct pathComponent);
947 elen += sizeof(struct pathComponent); 946 elen += sizeof(struct pathComponent);
948 } 947 }
949 948
diff --git a/fs/xattr.c b/fs/xattr.c
index 1c3d0af59ddf..6d4f6d3449fb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask)
66 return inode_permission(inode, mask); 66 return inode_permission(inode, mask);
67} 67}
68 68
69int 69/**
70vfs_setxattr(struct dentry *dentry, const char *name, const void *value, 70 * __vfs_setxattr_noperm - perform setxattr operation without performing
71 size_t size, int flags) 71 * permission checks.
72 *
73 * @dentry - object to perform setxattr on
74 * @name - xattr name to set
75 * @value - value to set @name to
76 * @size - size of @value
77 * @flags - flags to pass into filesystem operations
78 *
79 * returns the result of the internal setxattr or setsecurity operations.
80 *
81 * This function requires the caller to lock the inode's i_mutex before it
82 * is executed. It also assumes that the caller will make the appropriate
83 * permission checks.
84 */
85int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
86 const void *value, size_t size, int flags)
72{ 87{
73 struct inode *inode = dentry->d_inode; 88 struct inode *inode = dentry->d_inode;
74 int error; 89 int error = -EOPNOTSUPP;
75
76 error = xattr_permission(inode, name, MAY_WRITE);
77 if (error)
78 return error;
79 90
80 mutex_lock(&inode->i_mutex);
81 error = security_inode_setxattr(dentry, name, value, size, flags);
82 if (error)
83 goto out;
84 error = -EOPNOTSUPP;
85 if (inode->i_op->setxattr) { 91 if (inode->i_op->setxattr) {
86 error = inode->i_op->setxattr(dentry, name, value, size, flags); 92 error = inode->i_op->setxattr(dentry, name, value, size, flags);
87 if (!error) { 93 if (!error) {
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
97 if (!error) 103 if (!error)
98 fsnotify_xattr(dentry); 104 fsnotify_xattr(dentry);
99 } 105 }
106
107 return error;
108}
109
110
111int
112vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
113 size_t size, int flags)
114{
115 struct inode *inode = dentry->d_inode;
116 int error;
117
118 error = xattr_permission(inode, name, MAY_WRITE);
119 if (error)
120 return error;
121
122 mutex_lock(&inode->i_mutex);
123 error = security_inode_setxattr(dentry, name, value, size, flags);
124 if (error)
125 goto out;
126
127 error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
128
100out: 129out:
101 mutex_unlock(&inode->i_mutex); 130 mutex_unlock(&inode->i_mutex);
102 return error; 131 return error;
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index aecf2519db76..d5e5559e31db 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -216,7 +216,6 @@ xfs_setfilesize(
216 if (ip->i_d.di_size < isize) { 216 if (ip->i_d.di_size < isize) {
217 ip->i_d.di_size = isize; 217 ip->i_d.di_size = isize;
218 ip->i_update_core = 1; 218 ip->i_update_core = 1;
219 ip->i_update_size = 1;
220 xfs_mark_inode_dirty_sync(ip); 219 xfs_mark_inode_dirty_sync(ip);
221 } 220 }
222 221
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 0542fd507649..988d8f87bc0f 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -172,12 +172,21 @@ xfs_file_release(
172 */ 172 */
173STATIC int 173STATIC int
174xfs_file_fsync( 174xfs_file_fsync(
175 struct file *filp, 175 struct file *file,
176 struct dentry *dentry, 176 struct dentry *dentry,
177 int datasync) 177 int datasync)
178{ 178{
179 xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); 179 struct inode *inode = dentry->d_inode;
180 return -xfs_fsync(XFS_I(dentry->d_inode)); 180 struct xfs_inode *ip = XFS_I(inode);
181 int error;
182
183 /* capture size updates in I/O completion before writing the inode. */
184 error = filemap_fdatawait(inode->i_mapping);
185 if (error)
186 return error;
187
188 xfs_iflags_clear(ip, XFS_ITRUNCATED);
189 return -xfs_fsync(ip);
181} 190}
182 191
183STATIC int 192STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0882d166239a..eafcc7c18706 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl(
619 case XFS_IOC_GETVERSION_32: 619 case XFS_IOC_GETVERSION_32:
620 cmd = _NATIVE_IOC(cmd, long); 620 cmd = _NATIVE_IOC(cmd, long);
621 return xfs_file_ioctl(filp, cmd, p); 621 return xfs_file_ioctl(filp, cmd, p);
622 case XFS_IOC_SWAPEXT: { 622 case XFS_IOC_SWAPEXT_32: {
623 struct xfs_swapext sxp; 623 struct xfs_swapext sxp;
624 struct compat_xfs_swapext __user *sxu = arg; 624 struct compat_xfs_swapext __user *sxu = arg;
625 625
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 8070b34cc287..da0159d99f82 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -43,7 +43,6 @@
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_itable.h" 44#include "xfs_itable.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_acl.h"
47#include "xfs_attr.h" 46#include "xfs_attr.h"
48#include "xfs_buf_item.h" 47#include "xfs_buf_item.h"
49#include "xfs_utils.h" 48#include "xfs_utils.h"
@@ -485,14 +484,6 @@ xfs_vn_put_link(
485} 484}
486 485
487STATIC int 486STATIC int
488xfs_vn_permission(
489 struct inode *inode,
490 int mask)
491{
492 return generic_permission(inode, mask, xfs_check_acl);
493}
494
495STATIC int
496xfs_vn_getattr( 487xfs_vn_getattr(
497 struct vfsmount *mnt, 488 struct vfsmount *mnt,
498 struct dentry *dentry, 489 struct dentry *dentry,
@@ -696,7 +687,7 @@ xfs_vn_fiemap(
696} 687}
697 688
698static const struct inode_operations xfs_inode_operations = { 689static const struct inode_operations xfs_inode_operations = {
699 .permission = xfs_vn_permission, 690 .check_acl = xfs_check_acl,
700 .truncate = xfs_vn_truncate, 691 .truncate = xfs_vn_truncate,
701 .getattr = xfs_vn_getattr, 692 .getattr = xfs_vn_getattr,
702 .setattr = xfs_vn_setattr, 693 .setattr = xfs_vn_setattr,
@@ -724,7 +715,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
724 .rmdir = xfs_vn_unlink, 715 .rmdir = xfs_vn_unlink,
725 .mknod = xfs_vn_mknod, 716 .mknod = xfs_vn_mknod,
726 .rename = xfs_vn_rename, 717 .rename = xfs_vn_rename,
727 .permission = xfs_vn_permission, 718 .check_acl = xfs_check_acl,
728 .getattr = xfs_vn_getattr, 719 .getattr = xfs_vn_getattr,
729 .setattr = xfs_vn_setattr, 720 .setattr = xfs_vn_setattr,
730 .setxattr = generic_setxattr, 721 .setxattr = generic_setxattr,
@@ -749,7 +740,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
749 .rmdir = xfs_vn_unlink, 740 .rmdir = xfs_vn_unlink,
750 .mknod = xfs_vn_mknod, 741 .mknod = xfs_vn_mknod,
751 .rename = xfs_vn_rename, 742 .rename = xfs_vn_rename,
752 .permission = xfs_vn_permission, 743 .check_acl = xfs_check_acl,
753 .getattr = xfs_vn_getattr, 744 .getattr = xfs_vn_getattr,
754 .setattr = xfs_vn_setattr, 745 .setattr = xfs_vn_setattr,
755 .setxattr = generic_setxattr, 746 .setxattr = generic_setxattr,
@@ -762,7 +753,7 @@ static const struct inode_operations xfs_symlink_inode_operations = {
762 .readlink = generic_readlink, 753 .readlink = generic_readlink,
763 .follow_link = xfs_vn_follow_link, 754 .follow_link = xfs_vn_follow_link,
764 .put_link = xfs_vn_put_link, 755 .put_link = xfs_vn_put_link,
765 .permission = xfs_vn_permission, 756 .check_acl = xfs_check_acl,
766 .getattr = xfs_vn_getattr, 757 .getattr = xfs_vn_getattr,
767 .setattr = xfs_vn_setattr, 758 .setattr = xfs_vn_setattr,
768 .setxattr = generic_setxattr, 759 .setxattr = generic_setxattr,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 7078974a6eee..49e4a6aea73c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -812,18 +812,21 @@ write_retry:
812 812
813 /* Handle various SYNC-type writes */ 813 /* Handle various SYNC-type writes */
814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 814 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
815 loff_t end = pos + ret - 1;
815 int error2; 816 int error2;
816 817
817 xfs_iunlock(xip, iolock); 818 xfs_iunlock(xip, iolock);
818 if (need_i_mutex) 819 if (need_i_mutex)
819 mutex_unlock(&inode->i_mutex); 820 mutex_unlock(&inode->i_mutex);
820 error2 = sync_page_range(inode, mapping, pos, ret); 821
822 error2 = filemap_write_and_wait_range(mapping, pos, end);
821 if (!error) 823 if (!error)
822 error = error2; 824 error = error2;
823 if (need_i_mutex) 825 if (need_i_mutex)
824 mutex_lock(&inode->i_mutex); 826 mutex_lock(&inode->i_mutex);
825 xfs_ilock(xip, iolock); 827 xfs_ilock(xip, iolock);
826 error2 = xfs_write_sync_logforce(mp, xip); 828
829 error2 = xfs_fsync(xip);
827 if (!error) 830 if (!error)
828 error = error2; 831 error = error2;
829 } 832 }
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index c3526d445f6a..76fdc5861932 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -20,16 +20,9 @@
20 20
21DEFINE_PER_CPU(struct xfsstats, xfsstats); 21DEFINE_PER_CPU(struct xfsstats, xfsstats);
22 22
23STATIC int 23static int xfs_stat_proc_show(struct seq_file *m, void *v)
24xfs_read_xfsstats(
25 char *buffer,
26 char **start,
27 off_t offset,
28 int count,
29 int *eof,
30 void *data)
31{ 24{
32 int c, i, j, len, val; 25 int c, i, j, val;
33 __uint64_t xs_xstrat_bytes = 0; 26 __uint64_t xs_xstrat_bytes = 0;
34 __uint64_t xs_write_bytes = 0; 27 __uint64_t xs_write_bytes = 0;
35 __uint64_t xs_read_bytes = 0; 28 __uint64_t xs_read_bytes = 0;
@@ -60,18 +53,18 @@ xfs_read_xfsstats(
60 }; 53 };
61 54
62 /* Loop over all stats groups */ 55 /* Loop over all stats groups */
63 for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { 56 for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
64 len += sprintf(buffer + len, "%s", xstats[i].desc); 57 seq_printf(m, "%s", xstats[i].desc);
65 /* inner loop does each group */ 58 /* inner loop does each group */
66 while (j < xstats[i].endpoint) { 59 while (j < xstats[i].endpoint) {
67 val = 0; 60 val = 0;
68 /* sum over all cpus */ 61 /* sum over all cpus */
69 for_each_possible_cpu(c) 62 for_each_possible_cpu(c)
70 val += *(((__u32*)&per_cpu(xfsstats, c) + j)); 63 val += *(((__u32*)&per_cpu(xfsstats, c) + j));
71 len += sprintf(buffer + len, " %u", val); 64 seq_printf(m, " %u", val);
72 j++; 65 j++;
73 } 66 }
74 buffer[len++] = '\n'; 67 seq_putc(m, '\n');
75 } 68 }
76 /* extra precision counters */ 69 /* extra precision counters */
77 for_each_possible_cpu(i) { 70 for_each_possible_cpu(i) {
@@ -80,36 +73,38 @@ xfs_read_xfsstats(
80 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; 73 xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
81 } 74 }
82 75
83 len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", 76 seq_printf(m, "xpc %Lu %Lu %Lu\n",
84 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); 77 xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
85 len += sprintf(buffer + len, "debug %u\n", 78 seq_printf(m, "debug %u\n",
86#if defined(DEBUG) 79#if defined(DEBUG)
87 1); 80 1);
88#else 81#else
89 0); 82 0);
90#endif 83#endif
84 return 0;
85}
91 86
92 if (offset >= len) { 87static int xfs_stat_proc_open(struct inode *inode, struct file *file)
93 *start = buffer; 88{
94 *eof = 1; 89 return single_open(file, xfs_stat_proc_show, NULL);
95 return 0;
96 }
97 *start = buffer + offset;
98 if ((len -= offset) > count)
99 return count;
100 *eof = 1;
101
102 return len;
103} 90}
104 91
92static const struct file_operations xfs_stat_proc_fops = {
93 .owner = THIS_MODULE,
94 .open = xfs_stat_proc_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
105int 100int
106xfs_init_procfs(void) 101xfs_init_procfs(void)
107{ 102{
108 if (!proc_mkdir("fs/xfs", NULL)) 103 if (!proc_mkdir("fs/xfs", NULL))
109 goto out; 104 goto out;
110 105
111 if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, 106 if (!proc_create("fs/xfs/stat", 0, NULL,
112 xfs_read_xfsstats, NULL)) 107 &xfs_stat_proc_fops))
113 goto out_remove_entry; 108 goto out_remove_entry;
114 return 0; 109 return 0;
115 110
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a220d36f789b..5d7c60ac77b4 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -579,15 +579,19 @@ xfs_showargs(
579 else if (mp->m_qflags & XFS_UQUOTA_ACCT) 579 else if (mp->m_qflags & XFS_UQUOTA_ACCT)
580 seq_puts(m, "," MNTOPT_UQUOTANOENF); 580 seq_puts(m, "," MNTOPT_UQUOTANOENF);
581 581
582 if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) 582 /* Either project or group quotas can be active, not both */
583 seq_puts(m, "," MNTOPT_PRJQUOTA); 583
584 else if (mp->m_qflags & XFS_PQUOTA_ACCT) 584 if (mp->m_qflags & XFS_PQUOTA_ACCT) {
585 seq_puts(m, "," MNTOPT_PQUOTANOENF); 585 if (mp->m_qflags & XFS_OQUOTA_ENFD)
586 586 seq_puts(m, "," MNTOPT_PRJQUOTA);
587 if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD)) 587 else
588 seq_puts(m, "," MNTOPT_GRPQUOTA); 588 seq_puts(m, "," MNTOPT_PQUOTANOENF);
589 else if (mp->m_qflags & XFS_GQUOTA_ACCT) 589 } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
590 seq_puts(m, "," MNTOPT_GQUOTANOENF); 590 if (mp->m_qflags & XFS_OQUOTA_ENFD)
591 seq_puts(m, "," MNTOPT_GRPQUOTA);
592 else
593 seq_puts(m, "," MNTOPT_GQUOTANOENF);
594 }
591 595
592 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) 596 if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
593 seq_puts(m, "," MNTOPT_NOQUOTA); 597 seq_puts(m, "," MNTOPT_NOQUOTA);
@@ -687,7 +691,7 @@ xfs_barrier_test(
687 return error; 691 return error;
688} 692}
689 693
690void 694STATIC void
691xfs_mountfs_check_barriers(xfs_mount_t *mp) 695xfs_mountfs_check_barriers(xfs_mount_t *mp)
692{ 696{
693 int error; 697 int error;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 98ef624d9baf..320be6aea492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -749,21 +749,6 @@ __xfs_inode_clear_reclaim_tag(
749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 749 XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
750} 750}
751 751
752void
753xfs_inode_clear_reclaim_tag(
754 xfs_inode_t *ip)
755{
756 xfs_mount_t *mp = ip->i_mount;
757 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
758
759 read_lock(&pag->pag_ici_lock);
760 spin_lock(&ip->i_flags_lock);
761 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
762 spin_unlock(&ip->i_flags_lock);
763 read_unlock(&pag->pag_ici_lock);
764 xfs_put_perag(mp, pag);
765}
766
767STATIC int 752STATIC int
768xfs_reclaim_inode_now( 753xfs_reclaim_inode_now(
769 struct xfs_inode *ip, 754 struct xfs_inode *ip,
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 59120602588a..27920eb7a820 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -49,7 +49,6 @@ int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
49 49
50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); 50void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); 51void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
52void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
53void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, 52void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
54 struct xfs_inode *ip); 53 struct xfs_inode *ip);
55 54
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
index 21b08c0396a1..83e7ea3e25fa 100644
--- a/fs/xfs/quota/xfs_qm_stats.c
+++ b/fs/xfs/quota/xfs_qm_stats.c
@@ -48,50 +48,34 @@
48 48
49struct xqmstats xqmstats; 49struct xqmstats xqmstats;
50 50
51STATIC int 51static int xqm_proc_show(struct seq_file *m, void *v)
52xfs_qm_read_xfsquota(
53 char *buffer,
54 char **start,
55 off_t offset,
56 int count,
57 int *eof,
58 void *data)
59{ 52{
60 int len;
61
62 /* maximum; incore; ratio free to inuse; freelist */ 53 /* maximum; incore; ratio free to inuse; freelist */
63 len = sprintf(buffer, "%d\t%d\t%d\t%u\n", 54 seq_printf(m, "%d\t%d\t%d\t%u\n",
64 ndquot, 55 ndquot,
65 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, 56 xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
66 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, 57 xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
67 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); 58 xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0);
68 59 return 0;
69 if (offset >= len) {
70 *start = buffer;
71 *eof = 1;
72 return 0;
73 }
74 *start = buffer + offset;
75 if ((len -= offset) > count)
76 return count;
77 *eof = 1;
78
79 return len;
80} 60}
81 61
82STATIC int 62static int xqm_proc_open(struct inode *inode, struct file *file)
83xfs_qm_read_stats(
84 char *buffer,
85 char **start,
86 off_t offset,
87 int count,
88 int *eof,
89 void *data)
90{ 63{
91 int len; 64 return single_open(file, xqm_proc_show, NULL);
65}
66
67static const struct file_operations xqm_proc_fops = {
68 .owner = THIS_MODULE,
69 .open = xqm_proc_open,
70 .read = seq_read,
71 .llseek = seq_lseek,
72 .release = single_release,
73};
92 74
75static int xqmstat_proc_show(struct seq_file *m, void *v)
76{
93 /* quota performance statistics */ 77 /* quota performance statistics */
94 len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", 78 seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
95 xqmstats.xs_qm_dqreclaims, 79 xqmstats.xs_qm_dqreclaims,
96 xqmstats.xs_qm_dqreclaim_misses, 80 xqmstats.xs_qm_dqreclaim_misses,
97 xqmstats.xs_qm_dquot_dups, 81 xqmstats.xs_qm_dquot_dups,
@@ -100,25 +84,27 @@ xfs_qm_read_stats(
100 xqmstats.xs_qm_dqwants, 84 xqmstats.xs_qm_dqwants,
101 xqmstats.xs_qm_dqshake_reclaims, 85 xqmstats.xs_qm_dqshake_reclaims,
102 xqmstats.xs_qm_dqinact_reclaims); 86 xqmstats.xs_qm_dqinact_reclaims);
87 return 0;
88}
103 89
104 if (offset >= len) { 90static int xqmstat_proc_open(struct inode *inode, struct file *file)
105 *start = buffer; 91{
106 *eof = 1; 92 return single_open(file, xqmstat_proc_show, NULL);
107 return 0;
108 }
109 *start = buffer + offset;
110 if ((len -= offset) > count)
111 return count;
112 *eof = 1;
113
114 return len;
115} 93}
116 94
95static const struct file_operations xqmstat_proc_fops = {
96 .owner = THIS_MODULE,
97 .open = xqmstat_proc_open,
98 .read = seq_read,
99 .llseek = seq_lseek,
100 .release = single_release,
101};
102
117void 103void
118xfs_qm_init_procfs(void) 104xfs_qm_init_procfs(void)
119{ 105{
120 create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); 106 proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
121 create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); 107 proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
122} 108}
123 109
124void 110void
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index f24b50b68d03..a5d54bf4931b 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -198,6 +198,15 @@ typedef struct xfs_perag
198 xfs_agino_t pagi_count; /* number of allocated inodes */ 198 xfs_agino_t pagi_count; /* number of allocated inodes */
199 int pagb_count; /* pagb slots in use */ 199 int pagb_count; /* pagb slots in use */
200 xfs_perag_busy_t *pagb_list; /* unstable blocks */ 200 xfs_perag_busy_t *pagb_list; /* unstable blocks */
201
202 /*
203 * Inode allocation search lookup optimisation.
204 * If the pagino matches, the search for new inodes
205 * doesn't need to search the near ones again straight away
206 */
207 xfs_agino_t pagl_pagino;
208 xfs_agino_t pagl_leftrec;
209 xfs_agino_t pagl_rightrec;
201#ifdef __KERNEL__ 210#ifdef __KERNEL__
202 spinlock_t pagb_lock; /* lock for pagb_list */ 211 spinlock_t pagb_lock; /* lock for pagb_list */
203 212
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 8ee5b5a76a2a..8971fb09d387 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3713,7 +3713,7 @@ done:
3713 * entry (null if none). Else, *lastxp will be set to the index 3713 * entry (null if none). Else, *lastxp will be set to the index
3714 * of the found entry; *gotp will contain the entry. 3714 * of the found entry; *gotp will contain the entry.
3715 */ 3715 */
3716xfs_bmbt_rec_host_t * /* pointer to found extent entry */ 3716STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */
3717xfs_bmap_search_multi_extents( 3717xfs_bmap_search_multi_extents(
3718 xfs_ifork_t *ifp, /* inode fork pointer */ 3718 xfs_ifork_t *ifp, /* inode fork pointer */
3719 xfs_fileoff_t bno, /* block number searched for */ 3719 xfs_fileoff_t bno, /* block number searched for */
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 1b8ff9256bd0..56f62d2edc35 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks(
392 int whichfork, 392 int whichfork,
393 int *count); 393 int *count);
394 394
395/*
396 * Search the extent records for the entry containing block bno.
397 * If bno lies in a hole, point to the next entry. If bno lies
398 * past eof, *eofp will be set, and *prevp will contain the last
399 * entry (null if none). Else, *lastxp will be set to the index
400 * of the found entry; *gotp will contain the entry.
401 */
402xfs_bmbt_rec_host_t *
403xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *,
404 xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *);
405
406#endif /* __KERNEL__ */ 395#endif /* __KERNEL__ */
407 396
408#endif /* __XFS_BMAP_H__ */ 397#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 5c1ade06578e..eb7b702d0690 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -202,16 +202,6 @@ xfs_bmbt_get_state(
202 ext_flag); 202 ext_flag);
203} 203}
204 204
205/* Endian flipping versions of the bmbt extraction functions */
206void
207xfs_bmbt_disk_get_all(
208 xfs_bmbt_rec_t *r,
209 xfs_bmbt_irec_t *s)
210{
211 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
212 get_unaligned_be64(&r->l1), s);
213}
214
215/* 205/*
216 * Extract the blockcount field from an on disk bmap extent record. 206 * Extract the blockcount field from an on disk bmap extent record.
217 */ 207 */
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key(
816 *l1 = 0; 806 *l1 = 0;
817} 807}
818 808
809/* Endian flipping versions of the bmbt extraction functions */
810STATIC void
811xfs_bmbt_disk_get_all(
812 xfs_bmbt_rec_t *r,
813 xfs_bmbt_irec_t *s)
814{
815 __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
816 get_unaligned_be64(&r->l1), s);
817}
818
819STATIC void 819STATIC void
820xfs_bmbt_trace_record( 820xfs_bmbt_trace_record(
821 struct xfs_btree_cur *cur, 821 struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e8df007615e..5549d495947f 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); 220extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); 221extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r);
222 222
223extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
224extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); 223extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
225extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); 224extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
226 225
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 26717388acf5..52b5f14d0c32 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -646,46 +646,6 @@ xfs_btree_read_bufl(
646} 646}
647 647
648/* 648/*
649 * Get a buffer for the block, return it read in.
650 * Short-form addressing.
651 */
652int /* error */
653xfs_btree_read_bufs(
654 xfs_mount_t *mp, /* file system mount point */
655 xfs_trans_t *tp, /* transaction pointer */
656 xfs_agnumber_t agno, /* allocation group number */
657 xfs_agblock_t agbno, /* allocation group block number */
658 uint lock, /* lock flags for read_buf */
659 xfs_buf_t **bpp, /* buffer for agno/agbno */
660 int refval) /* ref count value for buffer */
661{
662 xfs_buf_t *bp; /* return value */
663 xfs_daddr_t d; /* real disk block address */
664 int error;
665
666 ASSERT(agno != NULLAGNUMBER);
667 ASSERT(agbno != NULLAGBLOCK);
668 d = XFS_AGB_TO_DADDR(mp, agno, agbno);
669 if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
670 mp->m_bsize, lock, &bp))) {
671 return error;
672 }
673 ASSERT(!bp || !XFS_BUF_GETERROR(bp));
674 if (bp != NULL) {
675 switch (refval) {
676 case XFS_ALLOC_BTREE_REF:
677 XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
678 break;
679 case XFS_INO_BTREE_REF:
680 XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval);
681 break;
682 }
683 }
684 *bpp = bp;
685 return 0;
686}
687
688/*
689 * Read-ahead the block, don't wait for it, don't return a buffer. 649 * Read-ahead the block, don't wait for it, don't return a buffer.
690 * Long-form addressing. 650 * Long-form addressing.
691 */ 651 */
@@ -2951,7 +2911,7 @@ error0:
2951 * inode we have to copy the single block it was pointing to into the 2911 * inode we have to copy the single block it was pointing to into the
2952 * inode. 2912 * inode.
2953 */ 2913 */
2954int 2914STATIC int
2955xfs_btree_kill_iroot( 2915xfs_btree_kill_iroot(
2956 struct xfs_btree_cur *cur) 2916 struct xfs_btree_cur *cur)
2957{ 2917{
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 4f852b735b96..7fa07062bdda 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -379,20 +379,6 @@ xfs_btree_read_bufl(
379 int refval);/* ref count value for buffer */ 379 int refval);/* ref count value for buffer */
380 380
381/* 381/*
382 * Get a buffer for the block, return it read in.
383 * Short-form addressing.
384 */
385int /* error */
386xfs_btree_read_bufs(
387 struct xfs_mount *mp, /* file system mount point */
388 struct xfs_trans *tp, /* transaction pointer */
389 xfs_agnumber_t agno, /* allocation group number */
390 xfs_agblock_t agbno, /* allocation group block number */
391 uint lock, /* lock flags for read_buf */
392 struct xfs_buf **bpp, /* buffer for agno/agbno */
393 int refval);/* ref count value for buffer */
394
395/*
396 * Read-ahead the block, don't wait for it, don't return a buffer. 382 * Read-ahead the block, don't wait for it, don't return a buffer.
397 * Long-form addressing. 383 * Long-form addressing.
398 */ 384 */
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
432int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); 418int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
433int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); 419int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
434int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); 420int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
435int xfs_btree_kill_iroot(struct xfs_btree_cur *);
436int xfs_btree_insert(struct xfs_btree_cur *, int *); 421int xfs_btree_insert(struct xfs_btree_cur *, int *);
437int xfs_btree_delete(struct xfs_btree_cur *, int *); 422int xfs_btree_delete(struct xfs_btree_cur *, int *);
438int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); 423int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 3120a3a5e20f..ab64f3efb43b 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment(
57} 57}
58 58
59/* 59/*
60 * Lookup the record equal to ino in the btree given by cur. 60 * Lookup a record by ino in the btree given by cur.
61 */
62STATIC int /* error */
63xfs_inobt_lookup_eq(
64 struct xfs_btree_cur *cur, /* btree cursor */
65 xfs_agino_t ino, /* starting inode of chunk */
66 __int32_t fcnt, /* free inode count */
67 xfs_inofree_t free, /* free inode mask */
68 int *stat) /* success/failure */
69{
70 cur->bc_rec.i.ir_startino = ino;
71 cur->bc_rec.i.ir_freecount = fcnt;
72 cur->bc_rec.i.ir_free = free;
73 return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
74}
75
76/*
77 * Lookup the first record greater than or equal to ino
78 * in the btree given by cur.
79 */ 61 */
80int /* error */ 62int /* error */
81xfs_inobt_lookup_ge( 63xfs_inobt_lookup(
82 struct xfs_btree_cur *cur, /* btree cursor */ 64 struct xfs_btree_cur *cur, /* btree cursor */
83 xfs_agino_t ino, /* starting inode of chunk */ 65 xfs_agino_t ino, /* starting inode of chunk */
84 __int32_t fcnt, /* free inode count */ 66 xfs_lookup_t dir, /* <=, >=, == */
85 xfs_inofree_t free, /* free inode mask */
86 int *stat) /* success/failure */ 67 int *stat) /* success/failure */
87{ 68{
88 cur->bc_rec.i.ir_startino = ino; 69 cur->bc_rec.i.ir_startino = ino;
89 cur->bc_rec.i.ir_freecount = fcnt; 70 cur->bc_rec.i.ir_freecount = 0;
90 cur->bc_rec.i.ir_free = free; 71 cur->bc_rec.i.ir_free = 0;
91 return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 72 return xfs_btree_lookup(cur, dir, stat);
92} 73}
93 74
94/* 75/*
95 * Lookup the first record less than or equal to ino 76 * Update the record referred to by cur to the value given.
96 * in the btree given by cur.
97 */
98int /* error */
99xfs_inobt_lookup_le(
100 struct xfs_btree_cur *cur, /* btree cursor */
101 xfs_agino_t ino, /* starting inode of chunk */
102 __int32_t fcnt, /* free inode count */
103 xfs_inofree_t free, /* free inode mask */
104 int *stat) /* success/failure */
105{
106 cur->bc_rec.i.ir_startino = ino;
107 cur->bc_rec.i.ir_freecount = fcnt;
108 cur->bc_rec.i.ir_free = free;
109 return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
110}
111
112/*
113 * Update the record referred to by cur to the value given
114 * by [ino, fcnt, free].
115 * This either works (return 0) or gets an EFSCORRUPTED error. 77 * This either works (return 0) or gets an EFSCORRUPTED error.
116 */ 78 */
117STATIC int /* error */ 79STATIC int /* error */
118xfs_inobt_update( 80xfs_inobt_update(
119 struct xfs_btree_cur *cur, /* btree cursor */ 81 struct xfs_btree_cur *cur, /* btree cursor */
120 xfs_agino_t ino, /* starting inode of chunk */ 82 xfs_inobt_rec_incore_t *irec) /* btree record */
121 __int32_t fcnt, /* free inode count */
122 xfs_inofree_t free) /* free inode mask */
123{ 83{
124 union xfs_btree_rec rec; 84 union xfs_btree_rec rec;
125 85
126 rec.inobt.ir_startino = cpu_to_be32(ino); 86 rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
127 rec.inobt.ir_freecount = cpu_to_be32(fcnt); 87 rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
128 rec.inobt.ir_free = cpu_to_be64(free); 88 rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
129 return xfs_btree_update(cur, &rec); 89 return xfs_btree_update(cur, &rec);
130} 90}
131 91
@@ -135,9 +95,7 @@ xfs_inobt_update(
135int /* error */ 95int /* error */
136xfs_inobt_get_rec( 96xfs_inobt_get_rec(
137 struct xfs_btree_cur *cur, /* btree cursor */ 97 struct xfs_btree_cur *cur, /* btree cursor */
138 xfs_agino_t *ino, /* output: starting inode of chunk */ 98 xfs_inobt_rec_incore_t *irec, /* btree record */
139 __int32_t *fcnt, /* output: number of free inodes */
140 xfs_inofree_t *free, /* output: free inode mask */
141 int *stat) /* output: success/failure */ 99 int *stat) /* output: success/failure */
142{ 100{
143 union xfs_btree_rec *rec; 101 union xfs_btree_rec *rec;
@@ -145,14 +103,136 @@ xfs_inobt_get_rec(
145 103
146 error = xfs_btree_get_rec(cur, &rec, stat); 104 error = xfs_btree_get_rec(cur, &rec, stat);
147 if (!error && *stat == 1) { 105 if (!error && *stat == 1) {
148 *ino = be32_to_cpu(rec->inobt.ir_startino); 106 irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
149 *fcnt = be32_to_cpu(rec->inobt.ir_freecount); 107 irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
150 *free = be64_to_cpu(rec->inobt.ir_free); 108 irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
151 } 109 }
152 return error; 110 return error;
153} 111}
154 112
155/* 113/*
114 * Verify that the number of free inodes in the AGI is correct.
115 */
116#ifdef DEBUG
117STATIC int
118xfs_check_agi_freecount(
119 struct xfs_btree_cur *cur,
120 struct xfs_agi *agi)
121{
122 if (cur->bc_nlevels == 1) {
123 xfs_inobt_rec_incore_t rec;
124 int freecount = 0;
125 int error;
126 int i;
127
128 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
129 if (error)
130 return error;
131
132 do {
133 error = xfs_inobt_get_rec(cur, &rec, &i);
134 if (error)
135 return error;
136
137 if (i) {
138 freecount += rec.ir_freecount;
139 error = xfs_btree_increment(cur, 0, &i);
140 if (error)
141 return error;
142 }
143 } while (i == 1);
144
145 if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
146 ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
147 }
148 return 0;
149}
150#else
151#define xfs_check_agi_freecount(cur, agi) 0
152#endif
153
154/*
155 * Initialise a new set of inodes.
156 */
157STATIC void
158xfs_ialloc_inode_init(
159 struct xfs_mount *mp,
160 struct xfs_trans *tp,
161 xfs_agnumber_t agno,
162 xfs_agblock_t agbno,
163 xfs_agblock_t length,
164 unsigned int gen)
165{
166 struct xfs_buf *fbuf;
167 struct xfs_dinode *free;
168 int blks_per_cluster, nbufs, ninodes;
169 int version;
170 int i, j;
171 xfs_daddr_t d;
172
173 /*
174 * Loop over the new block(s), filling in the inodes.
175 * For small block sizes, manipulate the inodes in buffers
176 * which are multiples of the blocks size.
177 */
178 if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
179 blks_per_cluster = 1;
180 nbufs = length;
181 ninodes = mp->m_sb.sb_inopblock;
182 } else {
183 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
184 mp->m_sb.sb_blocksize;
185 nbufs = length / blks_per_cluster;
186 ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
187 }
188
189 /*
190 * Figure out what version number to use in the inodes we create.
191 * If the superblock version has caught up to the one that supports
192 * the new inode format, then use the new inode version. Otherwise
193 * use the old version so that old kernels will continue to be
194 * able to use the file system.
195 */
196 if (xfs_sb_version_hasnlink(&mp->m_sb))
197 version = 2;
198 else
199 version = 1;
200
201 for (j = 0; j < nbufs; j++) {
202 /*
203 * Get the block.
204 */
205 d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
206 fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
207 mp->m_bsize * blks_per_cluster,
208 XFS_BUF_LOCK);
209 ASSERT(fbuf);
210 ASSERT(!XFS_BUF_GETERROR(fbuf));
211
212 /*
213 * Initialize all inodes in this buffer and then log them.
214 *
215 * XXX: It would be much better if we had just one transaction
216 * to log a whole cluster of inodes instead of all the
217 * individual transactions causing a lot of log traffic.
218 */
219 xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
220 for (i = 0; i < ninodes; i++) {
221 int ioffset = i << mp->m_sb.sb_inodelog;
222 uint isize = sizeof(struct xfs_dinode);
223
224 free = xfs_make_iptr(mp, fbuf, i);
225 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
226 free->di_version = version;
227 free->di_gen = cpu_to_be32(gen);
228 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
229 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
230 }
231 xfs_trans_inode_alloc_buf(tp, fbuf);
232 }
233}
234
235/*
156 * Allocate new inodes in the allocation group specified by agbp. 236 * Allocate new inodes in the allocation group specified by agbp.
157 * Return 0 for success, else error code. 237 * Return 0 for success, else error code.
158 */ 238 */
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc(
164{ 244{
165 xfs_agi_t *agi; /* allocation group header */ 245 xfs_agi_t *agi; /* allocation group header */
166 xfs_alloc_arg_t args; /* allocation argument structure */ 246 xfs_alloc_arg_t args; /* allocation argument structure */
167 int blks_per_cluster; /* fs blocks per inode cluster */
168 xfs_btree_cur_t *cur; /* inode btree cursor */ 247 xfs_btree_cur_t *cur; /* inode btree cursor */
169 xfs_daddr_t d; /* disk addr of buffer */
170 xfs_agnumber_t agno; 248 xfs_agnumber_t agno;
171 int error; 249 int error;
172 xfs_buf_t *fbuf; /* new free inodes' buffer */ 250 int i;
173 xfs_dinode_t *free; /* new free inode structure */
174 int i; /* inode counter */
175 int j; /* block counter */
176 int nbufs; /* num bufs of new inodes */
177 xfs_agino_t newino; /* new first inode's number */ 251 xfs_agino_t newino; /* new first inode's number */
178 xfs_agino_t newlen; /* new number of inodes */ 252 xfs_agino_t newlen; /* new number of inodes */
179 int ninodes; /* num inodes per buf */
180 xfs_agino_t thisino; /* current inode number, for loop */ 253 xfs_agino_t thisino; /* current inode number, for loop */
181 int version; /* inode version number to use */
182 int isaligned = 0; /* inode allocation at stripe unit */ 254 int isaligned = 0; /* inode allocation at stripe unit */
183 /* boundary */ 255 /* boundary */
184 unsigned int gen;
185 256
186 args.tp = tp; 257 args.tp = tp;
187 args.mp = tp->t_mountp; 258 args.mp = tp->t_mountp;
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc(
202 */ 273 */
203 agi = XFS_BUF_TO_AGI(agbp); 274 agi = XFS_BUF_TO_AGI(agbp);
204 newino = be32_to_cpu(agi->agi_newino); 275 newino = be32_to_cpu(agi->agi_newino);
276 agno = be32_to_cpu(agi->agi_seqno);
205 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + 277 args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
206 XFS_IALLOC_BLOCKS(args.mp); 278 XFS_IALLOC_BLOCKS(args.mp);
207 if (likely(newino != NULLAGINO && 279 if (likely(newino != NULLAGINO &&
208 (args.agbno < be32_to_cpu(agi->agi_length)))) { 280 (args.agbno < be32_to_cpu(agi->agi_length)))) {
209 args.fsbno = XFS_AGB_TO_FSB(args.mp, 281 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
210 be32_to_cpu(agi->agi_seqno), args.agbno);
211 args.type = XFS_ALLOCTYPE_THIS_BNO; 282 args.type = XFS_ALLOCTYPE_THIS_BNO;
212 args.mod = args.total = args.wasdel = args.isfl = 283 args.mod = args.total = args.wasdel = args.isfl =
213 args.userdata = args.minalignslop = 0; 284 args.userdata = args.minalignslop = 0;
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc(
258 * For now, just allocate blocks up front. 329 * For now, just allocate blocks up front.
259 */ 330 */
260 args.agbno = be32_to_cpu(agi->agi_root); 331 args.agbno = be32_to_cpu(agi->agi_root);
261 args.fsbno = XFS_AGB_TO_FSB(args.mp, 332 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
262 be32_to_cpu(agi->agi_seqno), args.agbno);
263 /* 333 /*
264 * Allocate a fixed-size extent of inodes. 334 * Allocate a fixed-size extent of inodes.
265 */ 335 */
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc(
282 if (isaligned && args.fsbno == NULLFSBLOCK) { 352 if (isaligned && args.fsbno == NULLFSBLOCK) {
283 args.type = XFS_ALLOCTYPE_NEAR_BNO; 353 args.type = XFS_ALLOCTYPE_NEAR_BNO;
284 args.agbno = be32_to_cpu(agi->agi_root); 354 args.agbno = be32_to_cpu(agi->agi_root);
285 args.fsbno = XFS_AGB_TO_FSB(args.mp, 355 args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
286 be32_to_cpu(agi->agi_seqno), args.agbno);
287 args.alignment = xfs_ialloc_cluster_alignment(&args); 356 args.alignment = xfs_ialloc_cluster_alignment(&args);
288 if ((error = xfs_alloc_vextent(&args))) 357 if ((error = xfs_alloc_vextent(&args)))
289 return error; 358 return error;
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc(
294 return 0; 363 return 0;
295 } 364 }
296 ASSERT(args.len == args.minlen); 365 ASSERT(args.len == args.minlen);
297 /*
298 * Convert the results.
299 */
300 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
301 /*
302 * Loop over the new block(s), filling in the inodes.
303 * For small block sizes, manipulate the inodes in buffers
304 * which are multiples of the blocks size.
305 */
306 if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) {
307 blks_per_cluster = 1;
308 nbufs = (int)args.len;
309 ninodes = args.mp->m_sb.sb_inopblock;
310 } else {
311 blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) /
312 args.mp->m_sb.sb_blocksize;
313 nbufs = (int)args.len / blks_per_cluster;
314 ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock;
315 }
316 /*
317 * Figure out what version number to use in the inodes we create.
318 * If the superblock version has caught up to the one that supports
319 * the new inode format, then use the new inode version. Otherwise
320 * use the old version so that old kernels will continue to be
321 * able to use the file system.
322 */
323 if (xfs_sb_version_hasnlink(&args.mp->m_sb))
324 version = 2;
325 else
326 version = 1;
327 366
328 /* 367 /*
368 * Stamp and write the inode buffers.
369 *
329 * Seed the new inode cluster with a random generation number. This 370 * Seed the new inode cluster with a random generation number. This
330 * prevents short-term reuse of generation numbers if a chunk is 371 * prevents short-term reuse of generation numbers if a chunk is
331 * freed and then immediately reallocated. We use random numbers 372 * freed and then immediately reallocated. We use random numbers
332 * rather than a linear progression to prevent the next generation 373 * rather than a linear progression to prevent the next generation
333 * number from being easily guessable. 374 * number from being easily guessable.
334 */ 375 */
335 gen = random32(); 376 xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len,
336 for (j = 0; j < nbufs; j++) { 377 random32());
337 /*
338 * Get the block.
339 */
340 d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno),
341 args.agbno + (j * blks_per_cluster));
342 fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d,
343 args.mp->m_bsize * blks_per_cluster,
344 XFS_BUF_LOCK);
345 ASSERT(fbuf);
346 ASSERT(!XFS_BUF_GETERROR(fbuf));
347 378
348 /* 379 /*
349 * Initialize all inodes in this buffer and then log them. 380 * Convert the results.
350 * 381 */
351 * XXX: It would be much better if we had just one transaction to 382 newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
352 * log a whole cluster of inodes instead of all the individual
353 * transactions causing a lot of log traffic.
354 */
355 xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
356 for (i = 0; i < ninodes; i++) {
357 int ioffset = i << args.mp->m_sb.sb_inodelog;
358 uint isize = sizeof(struct xfs_dinode);
359
360 free = xfs_make_iptr(args.mp, fbuf, i);
361 free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
362 free->di_version = version;
363 free->di_gen = cpu_to_be32(gen);
364 free->di_next_unlinked = cpu_to_be32(NULLAGINO);
365 xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
366 }
367 xfs_trans_inode_alloc_buf(tp, fbuf);
368 }
369 be32_add_cpu(&agi->agi_count, newlen); 383 be32_add_cpu(&agi->agi_count, newlen);
370 be32_add_cpu(&agi->agi_freecount, newlen); 384 be32_add_cpu(&agi->agi_freecount, newlen);
371 agno = be32_to_cpu(agi->agi_seqno);
372 down_read(&args.mp->m_peraglock); 385 down_read(&args.mp->m_peraglock);
373 args.mp->m_perag[agno].pagi_freecount += newlen; 386 args.mp->m_perag[agno].pagi_freecount += newlen;
374 up_read(&args.mp->m_peraglock); 387 up_read(&args.mp->m_peraglock);
375 agi->agi_newino = cpu_to_be32(newino); 388 agi->agi_newino = cpu_to_be32(newino);
389
376 /* 390 /*
377 * Insert records describing the new inode chunk into the btree. 391 * Insert records describing the new inode chunk into the btree.
378 */ 392 */
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc(
380 for (thisino = newino; 394 for (thisino = newino;
381 thisino < newino + newlen; 395 thisino < newino + newlen;
382 thisino += XFS_INODES_PER_CHUNK) { 396 thisino += XFS_INODES_PER_CHUNK) {
383 if ((error = xfs_inobt_lookup_eq(cur, thisino, 397 cur->bc_rec.i.ir_startino = thisino;
384 XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { 398 cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
399 cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
400 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
401 if (error) {
385 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 402 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
386 return error; 403 return error;
387 } 404 }
388 ASSERT(i == 0); 405 ASSERT(i == 0);
389 if ((error = xfs_btree_insert(cur, &i))) { 406 error = xfs_btree_insert(cur, &i);
407 if (error) {
390 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 408 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
391 return error; 409 return error;
392 } 410 }
@@ -539,6 +557,62 @@ nextag:
539} 557}
540 558
541/* 559/*
560 * Try to retrieve the next record to the left/right from the current one.
561 */
562STATIC int
563xfs_ialloc_next_rec(
564 struct xfs_btree_cur *cur,
565 xfs_inobt_rec_incore_t *rec,
566 int *done,
567 int left)
568{
569 int error;
570 int i;
571
572 if (left)
573 error = xfs_btree_decrement(cur, 0, &i);
574 else
575 error = xfs_btree_increment(cur, 0, &i);
576
577 if (error)
578 return error;
579 *done = !i;
580 if (i) {
581 error = xfs_inobt_get_rec(cur, rec, &i);
582 if (error)
583 return error;
584 XFS_WANT_CORRUPTED_RETURN(i == 1);
585 }
586
587 return 0;
588}
589
590STATIC int
591xfs_ialloc_get_rec(
592 struct xfs_btree_cur *cur,
593 xfs_agino_t agino,
594 xfs_inobt_rec_incore_t *rec,
595 int *done,
596 int left)
597{
598 int error;
599 int i;
600
601 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
602 if (error)
603 return error;
604 *done = !i;
605 if (i) {
606 error = xfs_inobt_get_rec(cur, rec, &i);
607 if (error)
608 return error;
609 XFS_WANT_CORRUPTED_RETURN(i == 1);
610 }
611
612 return 0;
613}
614
615/*
542 * Visible inode allocation functions. 616 * Visible inode allocation functions.
543 */ 617 */
544 618
@@ -592,8 +666,8 @@ xfs_dialloc(
592 int j; /* result code */ 666 int j; /* result code */
593 xfs_mount_t *mp; /* file system mount structure */ 667 xfs_mount_t *mp; /* file system mount structure */
594 int offset; /* index of inode in chunk */ 668 int offset; /* index of inode in chunk */
595 xfs_agino_t pagino; /* parent's a.g. relative inode # */ 669 xfs_agino_t pagino; /* parent's AG relative inode # */
596 xfs_agnumber_t pagno; /* parent's allocation group number */ 670 xfs_agnumber_t pagno; /* parent's AG number */
597 xfs_inobt_rec_incore_t rec; /* inode allocation record */ 671 xfs_inobt_rec_incore_t rec; /* inode allocation record */
598 xfs_agnumber_t tagno; /* testing allocation group number */ 672 xfs_agnumber_t tagno; /* testing allocation group number */
599 xfs_btree_cur_t *tcur; /* temp cursor */ 673 xfs_btree_cur_t *tcur; /* temp cursor */
@@ -716,6 +790,8 @@ nextag:
716 */ 790 */
717 agno = tagno; 791 agno = tagno;
718 *IO_agbp = NULL; 792 *IO_agbp = NULL;
793
794 restart_pagno:
719 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); 795 cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
720 /* 796 /*
721 * If pagino is 0 (this is the root inode allocation) use newino. 797 * If pagino is 0 (this is the root inode allocation) use newino.
@@ -723,220 +799,199 @@ nextag:
723 */ 799 */
724 if (!pagino) 800 if (!pagino)
725 pagino = be32_to_cpu(agi->agi_newino); 801 pagino = be32_to_cpu(agi->agi_newino);
726#ifdef DEBUG
727 if (cur->bc_nlevels == 1) {
728 int freecount = 0;
729 802
730 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 803 error = xfs_check_agi_freecount(cur, agi);
731 goto error0; 804 if (error)
732 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 805 goto error0;
733 do {
734 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
735 &rec.ir_freecount, &rec.ir_free, &i)))
736 goto error0;
737 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
738 freecount += rec.ir_freecount;
739 if ((error = xfs_btree_increment(cur, 0, &i)))
740 goto error0;
741 } while (i == 1);
742 806
743 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
744 XFS_FORCED_SHUTDOWN(mp));
745 }
746#endif
747 /* 807 /*
748 * If in the same a.g. as the parent, try to get near the parent. 808 * If in the same AG as the parent, try to get near the parent.
749 */ 809 */
750 if (pagno == agno) { 810 if (pagno == agno) {
751 if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) 811 xfs_perag_t *pag = &mp->m_perag[agno];
812 int doneleft; /* done, to the left */
813 int doneright; /* done, to the right */
814 int searchdistance = 10;
815
816 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
817 if (error)
818 goto error0;
819 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
820
821 error = xfs_inobt_get_rec(cur, &rec, &j);
822 if (error)
752 goto error0; 823 goto error0;
753 if (i != 0 && 824 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
754 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 825
755 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 826 if (rec.ir_freecount > 0) {
756 j == 1 &&
757 rec.ir_freecount > 0) {
758 /* 827 /*
759 * Found a free inode in the same chunk 828 * Found a free inode in the same chunk
760 * as parent, done. 829 * as the parent, done.
761 */ 830 */
831 goto alloc_inode;
762 } 832 }
833
834
835 /*
836 * In the same AG as parent, but parent's chunk is full.
837 */
838
839 /* duplicate the cursor, search left & right simultaneously */
840 error = xfs_btree_dup_cursor(cur, &tcur);
841 if (error)
842 goto error0;
843
763 /* 844 /*
764 * In the same a.g. as parent, but parent's chunk is full. 845 * Skip to last blocks looked up if same parent inode.
765 */ 846 */
766 else { 847 if (pagino != NULLAGINO &&
767 int doneleft; /* done, to the left */ 848 pag->pagl_pagino == pagino &&
768 int doneright; /* done, to the right */ 849 pag->pagl_leftrec != NULLAGINO &&
850 pag->pagl_rightrec != NULLAGINO) {
851 error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
852 &trec, &doneleft, 1);
853 if (error)
854 goto error1;
769 855
856 error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
857 &rec, &doneright, 0);
770 if (error) 858 if (error)
771 goto error0;
772 ASSERT(i == 1);
773 ASSERT(j == 1);
774 /*
775 * Duplicate the cursor, search left & right
776 * simultaneously.
777 */
778 if ((error = xfs_btree_dup_cursor(cur, &tcur)))
779 goto error0;
780 /*
781 * Search left with tcur, back up 1 record.
782 */
783 if ((error = xfs_btree_decrement(tcur, 0, &i)))
784 goto error1; 859 goto error1;
785 doneleft = !i; 860 } else {
786 if (!doneleft) { 861 /* search left with tcur, back up 1 record */
787 if ((error = xfs_inobt_get_rec(tcur, 862 error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
788 &trec.ir_startino, 863 if (error)
789 &trec.ir_freecount,
790 &trec.ir_free, &i)))
791 goto error1;
792 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
793 }
794 /*
795 * Search right with cur, go forward 1 record.
796 */
797 if ((error = xfs_btree_increment(cur, 0, &i)))
798 goto error1; 864 goto error1;
799 doneright = !i;
800 if (!doneright) {
801 if ((error = xfs_inobt_get_rec(cur,
802 &rec.ir_startino,
803 &rec.ir_freecount,
804 &rec.ir_free, &i)))
805 goto error1;
806 XFS_WANT_CORRUPTED_GOTO(i == 1, error1);
807 }
808 /*
809 * Loop until we find the closest inode chunk
810 * with a free one.
811 */
812 while (!doneleft || !doneright) {
813 int useleft; /* using left inode
814 chunk this time */
815 865
866 /* search right with cur, go forward 1 record. */
867 error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
868 if (error)
869 goto error1;
870 }
871
872 /*
873 * Loop until we find an inode chunk with a free inode.
874 */
875 while (!doneleft || !doneright) {
876 int useleft; /* using left inode chunk this time */
877
878 if (!--searchdistance) {
816 /* 879 /*
817 * Figure out which block is closer, 880 * Not in range - save last search
818 * if both are valid. 881 * location and allocate a new inode
819 */
820 if (!doneleft && !doneright)
821 useleft =
822 pagino -
823 (trec.ir_startino +
824 XFS_INODES_PER_CHUNK - 1) <
825 rec.ir_startino - pagino;
826 else
827 useleft = !doneleft;
828 /*
829 * If checking the left, does it have
830 * free inodes?
831 */
832 if (useleft && trec.ir_freecount) {
833 /*
834 * Yes, set it up as the chunk to use.
835 */
836 rec = trec;
837 xfs_btree_del_cursor(cur,
838 XFS_BTREE_NOERROR);
839 cur = tcur;
840 break;
841 }
842 /*
843 * If checking the right, does it have
844 * free inodes?
845 */
846 if (!useleft && rec.ir_freecount) {
847 /*
848 * Yes, it's already set up.
849 */
850 xfs_btree_del_cursor(tcur,
851 XFS_BTREE_NOERROR);
852 break;
853 }
854 /*
855 * If used the left, get another one
856 * further left.
857 */
858 if (useleft) {
859 if ((error = xfs_btree_decrement(tcur, 0,
860 &i)))
861 goto error1;
862 doneleft = !i;
863 if (!doneleft) {
864 if ((error = xfs_inobt_get_rec(
865 tcur,
866 &trec.ir_startino,
867 &trec.ir_freecount,
868 &trec.ir_free, &i)))
869 goto error1;
870 XFS_WANT_CORRUPTED_GOTO(i == 1,
871 error1);
872 }
873 }
874 /*
875 * If used the right, get another one
876 * further right.
877 */ 882 */
878 else { 883 pag->pagl_leftrec = trec.ir_startino;
879 if ((error = xfs_btree_increment(cur, 0, 884 pag->pagl_rightrec = rec.ir_startino;
880 &i))) 885 pag->pagl_pagino = pagino;
881 goto error1; 886 goto newino;
882 doneright = !i; 887 }
883 if (!doneright) { 888
884 if ((error = xfs_inobt_get_rec( 889 /* figure out the closer block if both are valid. */
885 cur, 890 if (!doneleft && !doneright) {
886 &rec.ir_startino, 891 useleft = pagino -
887 &rec.ir_freecount, 892 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
888 &rec.ir_free, &i))) 893 rec.ir_startino - pagino;
889 goto error1; 894 } else {
890 XFS_WANT_CORRUPTED_GOTO(i == 1, 895 useleft = !doneleft;
891 error1);
892 }
893 }
894 } 896 }
895 ASSERT(!doneleft || !doneright); 897
898 /* free inodes to the left? */
899 if (useleft && trec.ir_freecount) {
900 rec = trec;
901 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
902 cur = tcur;
903
904 pag->pagl_leftrec = trec.ir_startino;
905 pag->pagl_rightrec = rec.ir_startino;
906 pag->pagl_pagino = pagino;
907 goto alloc_inode;
908 }
909
910 /* free inodes to the right? */
911 if (!useleft && rec.ir_freecount) {
912 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
913
914 pag->pagl_leftrec = trec.ir_startino;
915 pag->pagl_rightrec = rec.ir_startino;
916 pag->pagl_pagino = pagino;
917 goto alloc_inode;
918 }
919
920 /* get next record to check */
921 if (useleft) {
922 error = xfs_ialloc_next_rec(tcur, &trec,
923 &doneleft, 1);
924 } else {
925 error = xfs_ialloc_next_rec(cur, &rec,
926 &doneright, 0);
927 }
928 if (error)
929 goto error1;
896 } 930 }
931
932 /*
933 * We've reached the end of the btree. because
934 * we are only searching a small chunk of the
935 * btree each search, there is obviously free
936 * inodes closer to the parent inode than we
937 * are now. restart the search again.
938 */
939 pag->pagl_pagino = NULLAGINO;
940 pag->pagl_leftrec = NULLAGINO;
941 pag->pagl_rightrec = NULLAGINO;
942 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
943 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
944 goto restart_pagno;
897 } 945 }
946
898 /* 947 /*
899 * In a different a.g. from the parent. 948 * In a different AG from the parent.
900 * See if the most recently allocated block has any free. 949 * See if the most recently allocated block has any free.
901 */ 950 */
902 else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { 951newino:
903 if ((error = xfs_inobt_lookup_eq(cur, 952 if (be32_to_cpu(agi->agi_newino) != NULLAGINO) {
904 be32_to_cpu(agi->agi_newino), 0, 0, &i))) 953 error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
954 XFS_LOOKUP_EQ, &i);
955 if (error)
905 goto error0; 956 goto error0;
906 if (i == 1 && 957
907 (error = xfs_inobt_get_rec(cur, &rec.ir_startino, 958 if (i == 1) {
908 &rec.ir_freecount, &rec.ir_free, &j)) == 0 && 959 error = xfs_inobt_get_rec(cur, &rec, &j);
909 j == 1 &&
910 rec.ir_freecount > 0) {
911 /*
912 * The last chunk allocated in the group still has
913 * a free inode.
914 */
915 }
916 /*
917 * None left in the last group, search the whole a.g.
918 */
919 else {
920 if (error) 960 if (error)
921 goto error0; 961 goto error0;
922 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 962
923 goto error0; 963 if (j == 1 && rec.ir_freecount > 0) {
924 ASSERT(i == 1); 964 /*
925 for (;;) { 965 * The last chunk allocated in the group
926 if ((error = xfs_inobt_get_rec(cur, 966 * still has a free inode.
927 &rec.ir_startino, 967 */
928 &rec.ir_freecount, &rec.ir_free, 968 goto alloc_inode;
929 &i)))
930 goto error0;
931 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
932 if (rec.ir_freecount > 0)
933 break;
934 if ((error = xfs_btree_increment(cur, 0, &i)))
935 goto error0;
936 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
937 } 969 }
938 } 970 }
939 } 971 }
972
973 /*
974 * None left in the last group, search the whole AG
975 */
976 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
977 if (error)
978 goto error0;
979 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
980
981 for (;;) {
982 error = xfs_inobt_get_rec(cur, &rec, &i);
983 if (error)
984 goto error0;
985 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
986 if (rec.ir_freecount > 0)
987 break;
988 error = xfs_btree_increment(cur, 0, &i);
989 if (error)
990 goto error0;
991 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
992 }
993
994alloc_inode:
940 offset = xfs_ialloc_find_free(&rec.ir_free); 995 offset = xfs_ialloc_find_free(&rec.ir_free);
941 ASSERT(offset >= 0); 996 ASSERT(offset >= 0);
942 ASSERT(offset < XFS_INODES_PER_CHUNK); 997 ASSERT(offset < XFS_INODES_PER_CHUNK);
@@ -945,33 +1000,19 @@ nextag:
945 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); 1000 ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
946 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1001 rec.ir_free &= ~XFS_INOBT_MASK(offset);
947 rec.ir_freecount--; 1002 rec.ir_freecount--;
948 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, 1003 error = xfs_inobt_update(cur, &rec);
949 rec.ir_free))) 1004 if (error)
950 goto error0; 1005 goto error0;
951 be32_add_cpu(&agi->agi_freecount, -1); 1006 be32_add_cpu(&agi->agi_freecount, -1);
952 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); 1007 xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
953 down_read(&mp->m_peraglock); 1008 down_read(&mp->m_peraglock);
954 mp->m_perag[tagno].pagi_freecount--; 1009 mp->m_perag[tagno].pagi_freecount--;
955 up_read(&mp->m_peraglock); 1010 up_read(&mp->m_peraglock);
956#ifdef DEBUG
957 if (cur->bc_nlevels == 1) {
958 int freecount = 0;
959 1011
960 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1012 error = xfs_check_agi_freecount(cur, agi);
961 goto error0; 1013 if (error)
962 do { 1014 goto error0;
963 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1015
964 &rec.ir_freecount, &rec.ir_free, &i)))
965 goto error0;
966 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
967 freecount += rec.ir_freecount;
968 if ((error = xfs_btree_increment(cur, 0, &i)))
969 goto error0;
970 } while (i == 1);
971 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
972 XFS_FORCED_SHUTDOWN(mp));
973 }
974#endif
975 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1016 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
976 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); 1017 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
977 *inop = ino; 1018 *inop = ino;
@@ -1062,38 +1103,23 @@ xfs_difree(
1062 * Initialize the cursor. 1103 * Initialize the cursor.
1063 */ 1104 */
1064 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1105 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1065#ifdef DEBUG
1066 if (cur->bc_nlevels == 1) {
1067 int freecount = 0;
1068 1106
1069 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) 1107 error = xfs_check_agi_freecount(cur, agi);
1070 goto error0; 1108 if (error)
1071 do { 1109 goto error0;
1072 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, 1110
1073 &rec.ir_freecount, &rec.ir_free, &i)))
1074 goto error0;
1075 if (i) {
1076 freecount += rec.ir_freecount;
1077 if ((error = xfs_btree_increment(cur, 0, &i)))
1078 goto error0;
1079 }
1080 } while (i == 1);
1081 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1082 XFS_FORCED_SHUTDOWN(mp));
1083 }
1084#endif
1085 /* 1111 /*
1086 * Look for the entry describing this inode. 1112 * Look for the entry describing this inode.
1087 */ 1113 */
1088 if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { 1114 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1089 cmn_err(CE_WARN, 1115 cmn_err(CE_WARN,
1090 "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", 1116 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.",
1091 error, mp->m_fsname); 1117 error, mp->m_fsname);
1092 goto error0; 1118 goto error0;
1093 } 1119 }
1094 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1120 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1095 if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, 1121 error = xfs_inobt_get_rec(cur, &rec, &i);
1096 &rec.ir_free, &i))) { 1122 if (error) {
1097 cmn_err(CE_WARN, 1123 cmn_err(CE_WARN,
1098 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1124 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.",
1099 error, mp->m_fsname); 1125 error, mp->m_fsname);
@@ -1148,12 +1174,14 @@ xfs_difree(
1148 } else { 1174 } else {
1149 *delete = 0; 1175 *delete = 0;
1150 1176
1151 if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { 1177 error = xfs_inobt_update(cur, &rec);
1178 if (error) {
1152 cmn_err(CE_WARN, 1179 cmn_err(CE_WARN,
1153 "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", 1180 "xfs_difree: xfs_inobt_update returned an error %d on %s.",
1154 error, mp->m_fsname); 1181 error, mp->m_fsname);
1155 goto error0; 1182 goto error0;
1156 } 1183 }
1184
1157 /* 1185 /*
1158 * Change the inode free counts and log the ag/sb changes. 1186 * Change the inode free counts and log the ag/sb changes.
1159 */ 1187 */
@@ -1165,28 +1193,10 @@ xfs_difree(
1165 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); 1193 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1166 } 1194 }
1167 1195
1168#ifdef DEBUG 1196 error = xfs_check_agi_freecount(cur, agi);
1169 if (cur->bc_nlevels == 1) { 1197 if (error)
1170 int freecount = 0; 1198 goto error0;
1171 1199
1172 if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i)))
1173 goto error0;
1174 do {
1175 if ((error = xfs_inobt_get_rec(cur,
1176 &rec.ir_startino,
1177 &rec.ir_freecount,
1178 &rec.ir_free, &i)))
1179 goto error0;
1180 if (i) {
1181 freecount += rec.ir_freecount;
1182 if ((error = xfs_btree_increment(cur, 0, &i)))
1183 goto error0;
1184 }
1185 } while (i == 1);
1186 ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
1187 XFS_FORCED_SHUTDOWN(mp));
1188 }
1189#endif
1190 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1200 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1191 return 0; 1201 return 0;
1192 1202
@@ -1297,9 +1307,7 @@ xfs_imap(
1297 chunk_agbno = agbno - offset_agbno; 1307 chunk_agbno = agbno - offset_agbno;
1298 } else { 1308 } else {
1299 xfs_btree_cur_t *cur; /* inode btree cursor */ 1309 xfs_btree_cur_t *cur; /* inode btree cursor */
1300 xfs_agino_t chunk_agino; /* first agino in inode chunk */ 1310 xfs_inobt_rec_incore_t chunk_rec;
1301 __int32_t chunk_cnt; /* count of free inodes in chunk */
1302 xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
1303 xfs_buf_t *agbp; /* agi buffer */ 1311 xfs_buf_t *agbp; /* agi buffer */
1304 int i; /* temp state */ 1312 int i; /* temp state */
1305 1313
@@ -1315,15 +1323,14 @@ xfs_imap(
1315 } 1323 }
1316 1324
1317 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); 1325 cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1318 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); 1326 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1319 if (error) { 1327 if (error) {
1320 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1321 "xfs_inobt_lookup_le() failed"); 1329 "xfs_inobt_lookup() failed");
1322 goto error0; 1330 goto error0;
1323 } 1331 }
1324 1332
1325 error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, 1333 error = xfs_inobt_get_rec(cur, &chunk_rec, &i);
1326 &chunk_free, &i);
1327 if (error) { 1334 if (error) {
1328 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1335 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
1329 "xfs_inobt_get_rec() failed"); 1336 "xfs_inobt_get_rec() failed");
@@ -1341,7 +1348,7 @@ xfs_imap(
1341 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1348 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1342 if (error) 1349 if (error)
1343 return error; 1350 return error;
1344 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); 1351 chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino);
1345 offset_agbno = agbno - chunk_agbno; 1352 offset_agbno = agbno - chunk_agbno;
1346 } 1353 }
1347 1354
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index aeee8278f92c..bb5385475e1f 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init(
150 xfs_agnumber_t agno); /* allocation group number */ 150 xfs_agnumber_t agno); /* allocation group number */
151 151
152/* 152/*
153 * Lookup the first record greater than or equal to ino 153 * Lookup a record by ino in the btree given by cur.
154 * in the btree given by cur.
155 */ 154 */
156int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, 155int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
157 __int32_t fcnt, xfs_inofree_t free, int *stat); 156 xfs_lookup_t dir, int *stat);
158
159/*
160 * Lookup the first record less than or equal to ino
161 * in the btree given by cur.
162 */
163int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
164 __int32_t fcnt, xfs_inofree_t free, int *stat);
165 157
166/* 158/*
167 * Get the data from the pointed-to record. 159 * Get the data from the pointed-to record.
168 */ 160 */
169extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, 161extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
170 __int32_t *fcnt, xfs_inofree_t *free, int *stat); 162 xfs_inobt_rec_incore_t *rec, int *stat);
171 163
172#endif /* __XFS_IALLOC_H__ */ 164#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index ecbf8b4d2e2e..80e526489be5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -82,7 +82,6 @@ xfs_inode_alloc(
82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 82 memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
83 ip->i_flags = 0; 83 ip->i_flags = 0;
84 ip->i_update_core = 0; 84 ip->i_update_core = 0;
85 ip->i_update_size = 0;
86 ip->i_delayed_blks = 0; 85 ip->i_delayed_blks = 0;
87 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 86 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
88 ip->i_size = 0; 87 ip->i_size = 0;
@@ -456,32 +455,6 @@ out_error_or_again:
456 return error; 455 return error;
457} 456}
458 457
459
460/*
461 * Look for the inode corresponding to the given ino in the hash table.
462 * If it is there and its i_transp pointer matches tp, return it.
463 * Otherwise, return NULL.
464 */
465xfs_inode_t *
466xfs_inode_incore(xfs_mount_t *mp,
467 xfs_ino_t ino,
468 xfs_trans_t *tp)
469{
470 xfs_inode_t *ip;
471 xfs_perag_t *pag;
472
473 pag = xfs_get_perag(mp, ino);
474 read_lock(&pag->pag_ici_lock);
475 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino));
476 read_unlock(&pag->pag_ici_lock);
477 xfs_put_perag(mp, pag);
478
479 /* the returned inode must match the transaction */
480 if (ip && (ip->i_transp != tp))
481 return NULL;
482 return ip;
483}
484
485/* 458/*
486 * Decrement reference count of an inode structure and unlock it. 459 * Decrement reference count of an inode structure and unlock it.
487 * 460 *
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index da428b3fe0f5..c1dc7ef5a1d8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -651,7 +651,7 @@ xfs_iformat_btree(
651 return 0; 651 return 0;
652} 652}
653 653
654void 654STATIC void
655xfs_dinode_from_disk( 655xfs_dinode_from_disk(
656 xfs_icdinode_t *to, 656 xfs_icdinode_t *to,
657 xfs_dinode_t *from) 657 xfs_dinode_t *from)
@@ -1247,7 +1247,7 @@ xfs_isize_check(
1247 * In that case the pages will still be in memory, but the inode size 1247 * In that case the pages will still be in memory, but the inode size
1248 * will never have been updated. 1248 * will never have been updated.
1249 */ 1249 */
1250xfs_fsize_t 1250STATIC xfs_fsize_t
1251xfs_file_last_byte( 1251xfs_file_last_byte(
1252 xfs_inode_t *ip) 1252 xfs_inode_t *ip)
1253{ 1253{
@@ -3837,7 +3837,7 @@ xfs_iext_inline_to_direct(
3837/* 3837/*
3838 * Resize an extent indirection array to new_size bytes. 3838 * Resize an extent indirection array to new_size bytes.
3839 */ 3839 */
3840void 3840STATIC void
3841xfs_iext_realloc_indirect( 3841xfs_iext_realloc_indirect(
3842 xfs_ifork_t *ifp, /* inode fork pointer */ 3842 xfs_ifork_t *ifp, /* inode fork pointer */
3843 int new_size) /* new indirection array size */ 3843 int new_size) /* new indirection array size */
@@ -3862,7 +3862,7 @@ xfs_iext_realloc_indirect(
3862/* 3862/*
3863 * Switch from indirection array to linear (direct) extent allocations. 3863 * Switch from indirection array to linear (direct) extent allocations.
3864 */ 3864 */
3865void 3865STATIC void
3866xfs_iext_indirect_to_direct( 3866xfs_iext_indirect_to_direct(
3867 xfs_ifork_t *ifp) /* inode fork pointer */ 3867 xfs_ifork_t *ifp) /* inode fork pointer */
3868{ 3868{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 65f24a3cc992..0b38b9a869ec 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -261,7 +261,6 @@ typedef struct xfs_inode {
261 /* Miscellaneous state. */ 261 /* Miscellaneous state. */
262 unsigned short i_flags; /* see defined flags below */ 262 unsigned short i_flags; /* see defined flags below */
263 unsigned char i_update_core; /* timestamps/size is dirty */ 263 unsigned char i_update_core; /* timestamps/size is dirty */
264 unsigned char i_update_size; /* di_size field is dirty */
265 unsigned int i_delayed_blks; /* count of delay alloc blks */ 264 unsigned int i_delayed_blks; /* count of delay alloc blks */
266 265
267 xfs_icdinode_t i_d; /* most of ondisk inode */ 266 xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -468,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
468/* 467/*
469 * xfs_iget.c prototypes. 468 * xfs_iget.c prototypes.
470 */ 469 */
471xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
472 struct xfs_trans *);
473int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 470int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
474 uint, uint, xfs_inode_t **, xfs_daddr_t); 471 uint, uint, xfs_inode_t **, xfs_daddr_t);
475void xfs_iput(xfs_inode_t *, uint); 472void xfs_iput(xfs_inode_t *, uint);
@@ -504,7 +501,6 @@ void xfs_ipin(xfs_inode_t *);
504void xfs_iunpin(xfs_inode_t *); 501void xfs_iunpin(xfs_inode_t *);
505int xfs_iflush(xfs_inode_t *, uint); 502int xfs_iflush(xfs_inode_t *, uint);
506void xfs_ichgtime(xfs_inode_t *, int); 503void xfs_ichgtime(xfs_inode_t *, int);
507xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
508void xfs_lock_inodes(xfs_inode_t **, int, uint); 504void xfs_lock_inodes(xfs_inode_t **, int, uint);
509void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 505void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
510 506
@@ -572,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
572 struct xfs_buf **, uint); 568 struct xfs_buf **, uint);
573int xfs_iread(struct xfs_mount *, struct xfs_trans *, 569int xfs_iread(struct xfs_mount *, struct xfs_trans *,
574 struct xfs_inode *, xfs_daddr_t, uint); 570 struct xfs_inode *, xfs_daddr_t, uint);
575void xfs_dinode_from_disk(struct xfs_icdinode *,
576 struct xfs_dinode *);
577void xfs_dinode_to_disk(struct xfs_dinode *, 571void xfs_dinode_to_disk(struct xfs_dinode *,
578 struct xfs_icdinode *); 572 struct xfs_icdinode *);
579void xfs_idestroy_fork(struct xfs_inode *, int); 573void xfs_idestroy_fork(struct xfs_inode *, int);
@@ -592,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int);
592void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); 586void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int);
593void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); 587void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int);
594void xfs_iext_realloc_direct(xfs_ifork_t *, int); 588void xfs_iext_realloc_direct(xfs_ifork_t *, int);
595void xfs_iext_realloc_indirect(xfs_ifork_t *, int);
596void xfs_iext_indirect_to_direct(xfs_ifork_t *);
597void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); 589void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t);
598void xfs_iext_inline_to_direct(xfs_ifork_t *, int); 590void xfs_iext_inline_to_direct(xfs_ifork_t *, int);
599void xfs_iext_destroy(xfs_ifork_t *); 591void xfs_iext_destroy(xfs_ifork_t *);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 977c4aec587e..47d5b663c37e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -263,14 +263,6 @@ xfs_inode_item_format(
263 } 263 }
264 264
265 /* 265 /*
266 * We don't have to worry about re-ordering here because
267 * the update_size field is protected by the inode lock
268 * and we have that held in exclusive mode.
269 */
270 if (ip->i_update_size)
271 ip->i_update_size = 0;
272
273 /*
274 * Make sure to get the latest atime from the Linux inode. 266 * Make sure to get the latest atime from the Linux inode.
275 */ 267 */
276 xfs_synchronize_atime(ip); 268 xfs_synchronize_atime(ip);
@@ -712,8 +704,6 @@ xfs_inode_item_unlock(
712 * Clear out the fields of the inode log item particular 704 * Clear out the fields of the inode log item particular
713 * to the current transaction. 705 * to the current transaction.
714 */ 706 */
715 iip->ili_ilock_recur = 0;
716 iip->ili_iolock_recur = 0;
717 iip->ili_flags = 0; 707 iip->ili_flags = 0;
718 708
719 /* 709 /*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index a52ac125f055..65bae4c9b8bf 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item {
137 struct xfs_inode *ili_inode; /* inode ptr */ 137 struct xfs_inode *ili_inode; /* inode ptr */
138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ 138 xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ 139 xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
140 unsigned short ili_ilock_recur; /* lock recursion count */
141 unsigned short ili_iolock_recur; /* lock recursion count */
142 unsigned short ili_flags; /* misc flags */ 140 unsigned short ili_flags; /* misc flags */
143 unsigned short ili_logged; /* flushed logged data */ 141 unsigned short ili_logged; /* flushed logged data */
144 unsigned int ili_last_fields; /* fields when flushed */ 142 unsigned int ili_last_fields; /* fields when flushed */
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h
index 7a28191cb0de..b8e4ee4e89a4 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/xfs_inum.h
@@ -72,7 +72,6 @@ struct xfs_mount;
72 72
73#if XFS_BIG_INUMS 73#if XFS_BIG_INUMS
74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) 74#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL))
75#define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32))
76#else 75#else
77#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) 76#define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL))
78#endif 77#endif
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index aeb2d2221c7d..b68f9107e26c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -39,7 +39,7 @@
39#include "xfs_error.h" 39#include "xfs_error.h"
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41 41
42int 42STATIC int
43xfs_internal_inum( 43xfs_internal_inum(
44 xfs_mount_t *mp, 44 xfs_mount_t *mp,
45 xfs_ino_t ino) 45 xfs_ino_t ino)
@@ -353,9 +353,6 @@ xfs_bulkstat(
353 int end_of_ag; /* set if we've seen the ag end */ 353 int end_of_ag; /* set if we've seen the ag end */
354 int error; /* error code */ 354 int error; /* error code */
355 int fmterror;/* bulkstat formatter result */ 355 int fmterror;/* bulkstat formatter result */
356 __int32_t gcnt; /* current btree rec's count */
357 xfs_inofree_t gfree; /* current btree rec's free mask */
358 xfs_agino_t gino; /* current btree rec's start inode */
359 int i; /* loop index */ 356 int i; /* loop index */
360 int icount; /* count of inodes good in irbuf */ 357 int icount; /* count of inodes good in irbuf */
361 size_t irbsize; /* size of irec buffer in bytes */ 358 size_t irbsize; /* size of irec buffer in bytes */
@@ -442,40 +439,43 @@ xfs_bulkstat(
442 * we need to get the remainder of the chunk we're in. 439 * we need to get the remainder of the chunk we're in.
443 */ 440 */
444 if (agino > 0) { 441 if (agino > 0) {
442 xfs_inobt_rec_incore_t r;
443
445 /* 444 /*
446 * Lookup the inode chunk that this inode lives in. 445 * Lookup the inode chunk that this inode lives in.
447 */ 446 */
448 error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); 447 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE,
448 &tmp);
449 if (!error && /* no I/O error */ 449 if (!error && /* no I/O error */
450 tmp && /* lookup succeeded */ 450 tmp && /* lookup succeeded */
451 /* got the record, should always work */ 451 /* got the record, should always work */
452 !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, 452 !(error = xfs_inobt_get_rec(cur, &r, &i)) &&
453 &gfree, &i)) &&
454 i == 1 && 453 i == 1 &&
455 /* this is the right chunk */ 454 /* this is the right chunk */
456 agino < gino + XFS_INODES_PER_CHUNK && 455 agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
457 /* lastino was not last in chunk */ 456 /* lastino was not last in chunk */
458 (chunkidx = agino - gino + 1) < 457 (chunkidx = agino - r.ir_startino + 1) <
459 XFS_INODES_PER_CHUNK && 458 XFS_INODES_PER_CHUNK &&
460 /* there are some left allocated */ 459 /* there are some left allocated */
461 xfs_inobt_maskn(chunkidx, 460 xfs_inobt_maskn(chunkidx,
462 XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { 461 XFS_INODES_PER_CHUNK - chunkidx) &
462 ~r.ir_free) {
463 /* 463 /*
464 * Grab the chunk record. Mark all the 464 * Grab the chunk record. Mark all the
465 * uninteresting inodes (because they're 465 * uninteresting inodes (because they're
466 * before our start point) free. 466 * before our start point) free.
467 */ 467 */
468 for (i = 0; i < chunkidx; i++) { 468 for (i = 0; i < chunkidx; i++) {
469 if (XFS_INOBT_MASK(i) & ~gfree) 469 if (XFS_INOBT_MASK(i) & ~r.ir_free)
470 gcnt++; 470 r.ir_freecount++;
471 } 471 }
472 gfree |= xfs_inobt_maskn(0, chunkidx); 472 r.ir_free |= xfs_inobt_maskn(0, chunkidx);
473 irbp->ir_startino = gino; 473 irbp->ir_startino = r.ir_startino;
474 irbp->ir_freecount = gcnt; 474 irbp->ir_freecount = r.ir_freecount;
475 irbp->ir_free = gfree; 475 irbp->ir_free = r.ir_free;
476 irbp++; 476 irbp++;
477 agino = gino + XFS_INODES_PER_CHUNK; 477 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
478 icount = XFS_INODES_PER_CHUNK - gcnt; 478 icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
479 } else { 479 } else {
480 /* 480 /*
481 * If any of those tests failed, bump the 481 * If any of those tests failed, bump the
@@ -493,7 +493,7 @@ xfs_bulkstat(
493 /* 493 /*
494 * Start of ag. Lookup the first inode chunk. 494 * Start of ag. Lookup the first inode chunk.
495 */ 495 */
496 error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); 496 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
497 icount = 0; 497 icount = 0;
498 } 498 }
499 /* 499 /*
@@ -501,6 +501,8 @@ xfs_bulkstat(
501 * until we run out of inodes or space in the buffer. 501 * until we run out of inodes or space in the buffer.
502 */ 502 */
503 while (irbp < irbufend && icount < ubcount) { 503 while (irbp < irbufend && icount < ubcount) {
504 xfs_inobt_rec_incore_t r;
505
504 /* 506 /*
505 * Loop as long as we're unable to read the 507 * Loop as long as we're unable to read the
506 * inode btree. 508 * inode btree.
@@ -510,51 +512,55 @@ xfs_bulkstat(
510 if (XFS_AGINO_TO_AGBNO(mp, agino) >= 512 if (XFS_AGINO_TO_AGBNO(mp, agino) >=
511 be32_to_cpu(agi->agi_length)) 513 be32_to_cpu(agi->agi_length))
512 break; 514 break;
513 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, 515 error = xfs_inobt_lookup(cur, agino,
514 &tmp); 516 XFS_LOOKUP_GE, &tmp);
515 cond_resched(); 517 cond_resched();
516 } 518 }
517 /* 519 /*
518 * If ran off the end of the ag either with an error, 520 * If ran off the end of the ag either with an error,
519 * or the normal way, set end and stop collecting. 521 * or the normal way, set end and stop collecting.
520 */ 522 */
521 if (error || 523 if (error) {
522 (error = xfs_inobt_get_rec(cur, &gino, &gcnt,
523 &gfree, &i)) ||
524 i == 0) {
525 end_of_ag = 1; 524 end_of_ag = 1;
526 break; 525 break;
527 } 526 }
527
528 error = xfs_inobt_get_rec(cur, &r, &i);
529 if (error || i == 0) {
530 end_of_ag = 1;
531 break;
532 }
533
528 /* 534 /*
529 * If this chunk has any allocated inodes, save it. 535 * If this chunk has any allocated inodes, save it.
530 * Also start read-ahead now for this chunk. 536 * Also start read-ahead now for this chunk.
531 */ 537 */
532 if (gcnt < XFS_INODES_PER_CHUNK) { 538 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
533 /* 539 /*
534 * Loop over all clusters in the next chunk. 540 * Loop over all clusters in the next chunk.
535 * Do a readahead if there are any allocated 541 * Do a readahead if there are any allocated
536 * inodes in that cluster. 542 * inodes in that cluster.
537 */ 543 */
538 for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), 544 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
539 chunkidx = 0; 545 for (chunkidx = 0;
540 chunkidx < XFS_INODES_PER_CHUNK; 546 chunkidx < XFS_INODES_PER_CHUNK;
541 chunkidx += nicluster, 547 chunkidx += nicluster,
542 agbno += nbcluster) { 548 agbno += nbcluster) {
543 if (xfs_inobt_maskn(chunkidx, 549 if (xfs_inobt_maskn(chunkidx, nicluster)
544 nicluster) & ~gfree) 550 & ~r.ir_free)
545 xfs_btree_reada_bufs(mp, agno, 551 xfs_btree_reada_bufs(mp, agno,
546 agbno, nbcluster); 552 agbno, nbcluster);
547 } 553 }
548 irbp->ir_startino = gino; 554 irbp->ir_startino = r.ir_startino;
549 irbp->ir_freecount = gcnt; 555 irbp->ir_freecount = r.ir_freecount;
550 irbp->ir_free = gfree; 556 irbp->ir_free = r.ir_free;
551 irbp++; 557 irbp++;
552 icount += XFS_INODES_PER_CHUNK - gcnt; 558 icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
553 } 559 }
554 /* 560 /*
555 * Set agino to after this chunk and bump the cursor. 561 * Set agino to after this chunk and bump the cursor.
556 */ 562 */
557 agino = gino + XFS_INODES_PER_CHUNK; 563 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
558 error = xfs_btree_increment(cur, 0, &tmp); 564 error = xfs_btree_increment(cur, 0, &tmp);
559 cond_resched(); 565 cond_resched();
560 } 566 }
@@ -820,9 +826,7 @@ xfs_inumbers(
820 int bufidx; 826 int bufidx;
821 xfs_btree_cur_t *cur; 827 xfs_btree_cur_t *cur;
822 int error; 828 int error;
823 __int32_t gcnt; 829 xfs_inobt_rec_incore_t r;
824 xfs_inofree_t gfree;
825 xfs_agino_t gino;
826 int i; 830 int i;
827 xfs_ino_t ino; 831 xfs_ino_t ino;
828 int left; 832 int left;
@@ -855,7 +859,8 @@ xfs_inumbers(
855 continue; 859 continue;
856 } 860 }
857 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); 861 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
858 error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); 862 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
863 &tmp);
859 if (error) { 864 if (error) {
860 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 865 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
861 cur = NULL; 866 cur = NULL;
@@ -870,9 +875,8 @@ xfs_inumbers(
870 continue; 875 continue;
871 } 876 }
872 } 877 }
873 if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, 878 error = xfs_inobt_get_rec(cur, &r, &i);
874 &i)) || 879 if (error || i == 0) {
875 i == 0) {
876 xfs_buf_relse(agbp); 880 xfs_buf_relse(agbp);
877 agbp = NULL; 881 agbp = NULL;
878 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 882 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -881,10 +885,12 @@ xfs_inumbers(
881 agino = 0; 885 agino = 0;
882 continue; 886 continue;
883 } 887 }
884 agino = gino + XFS_INODES_PER_CHUNK - 1; 888 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
885 buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); 889 buffer[bufidx].xi_startino =
886 buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; 890 XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
887 buffer[bufidx].xi_allocmask = ~gfree; 891 buffer[bufidx].xi_alloccount =
892 XFS_INODES_PER_CHUNK - r.ir_freecount;
893 buffer[bufidx].xi_allocmask = ~r.ir_free;
888 bufidx++; 894 bufidx++;
889 left--; 895 left--;
890 if (bufidx == bcount) { 896 if (bufidx == bcount) {
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 1fb04e7deb61..20792bf45946 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -99,11 +99,6 @@ xfs_bulkstat_one(
99 void *dibuff, 99 void *dibuff,
100 int *stat); 100 int *stat);
101 101
102int
103xfs_internal_inum(
104 xfs_mount_t *mp,
105 xfs_ino_t ino);
106
107typedef int (*inumbers_fmt_pf)( 102typedef int (*inumbers_fmt_pf)(
108 void __user *ubuffer, /* buffer to write to */ 103 void __user *ubuffer, /* buffer to write to */
109 const xfs_inogrp_t *buffer, /* buffer to read from */ 104 const xfs_inogrp_t *buffer, /* buffer to read from */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index bcad5f4c1fd1..679c7c4926a2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log,
451extern int xlog_recover(xlog_t *log); 451extern int xlog_recover(xlog_t *log);
452extern int xlog_recover_finish(xlog_t *log); 452extern int xlog_recover_finish(xlog_t *log);
453extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 453extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
454extern void xlog_recover_process_iunlinks(xlog_t *log);
455
456extern struct xfs_buf *xlog_get_bp(xlog_t *, int); 454extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
457extern void xlog_put_bp(struct xfs_buf *); 455extern void xlog_put_bp(struct xfs_buf *);
458 456
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 47da2fb45377..1099395d7d6c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink(
3263 * freeing of the inode and its removal from the list must be 3263 * freeing of the inode and its removal from the list must be
3264 * atomic. 3264 * atomic.
3265 */ 3265 */
3266void 3266STATIC void
3267xlog_recover_process_iunlinks( 3267xlog_recover_process_iunlinks(
3268 xlog_t *log) 3268 xlog_t *log)
3269{ 3269{
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5c6f092659c1..8b6c9e807efb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
1568 * 1568 *
1569 * The m_sb_lock must be held when this routine is called. 1569 * The m_sb_lock must be held when this routine is called.
1570 */ 1570 */
1571int 1571STATIC int
1572xfs_mod_incore_sb_unlocked( 1572xfs_mod_incore_sb_unlocked(
1573 xfs_mount_t *mp, 1573 xfs_mount_t *mp,
1574 xfs_sb_field_t field, 1574 xfs_sb_field_t field,
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a5122382afde..a6c023bc0fb2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb {
414 414
415extern int xfs_log_sbcount(xfs_mount_t *, uint); 415extern int xfs_log_sbcount(xfs_mount_t *, uint);
416extern int xfs_mountfs(xfs_mount_t *mp); 416extern int xfs_mountfs(xfs_mount_t *mp);
417extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
418 417
419extern void xfs_unmountfs(xfs_mount_t *); 418extern void xfs_unmountfs(xfs_mount_t *);
420extern int xfs_unmountfs_writesb(xfs_mount_t *); 419extern int xfs_unmountfs_writesb(xfs_mount_t *);
421extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 420extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
422extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
423 int64_t, int);
424extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 421extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
425 uint, int); 422 uint, int);
426extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); 423extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index afee7eb24323..4b0613d99faa 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup(
564} 564}
565 565
566/* 566/*
567 * To look up an element using its key, but leave its location in the internal
568 * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this
569 * function returns NULL.
570 *
571 * See the comments above the declaration of the xfs_mru_cache_lookup() function
572 * for important locking information pertaining to this call.
573 */
574void *
575xfs_mru_cache_peek(
576 xfs_mru_cache_t *mru,
577 unsigned long key)
578{
579 xfs_mru_cache_elem_t *elem;
580
581 ASSERT(mru && mru->lists);
582 if (!mru || !mru->lists)
583 return NULL;
584
585 spin_lock(&mru->lock);
586 elem = radix_tree_lookup(&mru->store, key);
587 if (!elem)
588 spin_unlock(&mru->lock);
589 else
590 __release(mru_lock); /* help sparse not be stupid */
591
592 return elem ? elem->value : NULL;
593}
594
595/*
596 * To release the internal data structure spinlock after having performed an 567 * To release the internal data structure spinlock after having performed an
597 * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() 568 * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done()
598 * with the data store pointer. 569 * with the data store pointer.
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index dd58ea1bbebe..5d439f34b0c9 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
49void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); 49void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
50void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); 50void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
51void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); 51void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
52void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key);
53void xfs_mru_cache_done(struct xfs_mru_cache *mru); 52void xfs_mru_cache_done(struct xfs_mru_cache *mru);
54 53
55#endif /* __XFS_MRU_CACHE_H__ */ 54#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index fea68615ed23..3f816ad7ff19 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -88,90 +88,6 @@ xfs_write_clear_setuid(
88} 88}
89 89
90/* 90/*
91 * Handle logging requirements of various synchronous types of write.
92 */
93int
94xfs_write_sync_logforce(
95 xfs_mount_t *mp,
96 xfs_inode_t *ip)
97{
98 int error = 0;
99
100 /*
101 * If we're treating this as O_DSYNC and we have not updated the
102 * size, force the log.
103 */
104 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
105 !(ip->i_update_size)) {
106 xfs_inode_log_item_t *iip = ip->i_itemp;
107
108 /*
109 * If an allocation transaction occurred
110 * without extending the size, then we have to force
111 * the log up the proper point to ensure that the
112 * allocation is permanent. We can't count on
113 * the fact that buffered writes lock out direct I/O
114 * writes - the direct I/O write could have extended
115 * the size nontransactionally, then finished before
116 * we started. xfs_write_file will think that the file
117 * didn't grow but the update isn't safe unless the
118 * size change is logged.
119 *
120 * Force the log if we've committed a transaction
121 * against the inode or if someone else has and
122 * the commit record hasn't gone to disk (e.g.
123 * the inode is pinned). This guarantees that
124 * all changes affecting the inode are permanent
125 * when we return.
126 */
127 if (iip && iip->ili_last_lsn) {
128 error = _xfs_log_force(mp, iip->ili_last_lsn,
129 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
130 } else if (xfs_ipincount(ip) > 0) {
131 error = _xfs_log_force(mp, (xfs_lsn_t)0,
132 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
133 }
134
135 } else {
136 xfs_trans_t *tp;
137
138 /*
139 * O_SYNC or O_DSYNC _with_ a size update are handled
140 * the same way.
141 *
142 * If the write was synchronous then we need to make
143 * sure that the inode modification time is permanent.
144 * We'll have updated the timestamp above, so here
145 * we use a synchronous transaction to log the inode.
146 * It's not fast, but it's necessary.
147 *
148 * If this a dsync write and the size got changed
149 * non-transactionally, then we need to ensure that
150 * the size change gets logged in a synchronous
151 * transaction.
152 */
153 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
154 if ((error = xfs_trans_reserve(tp, 0,
155 XFS_SWRITE_LOG_RES(mp),
156 0, 0, 0))) {
157 /* Transaction reserve failed */
158 xfs_trans_cancel(tp, 0);
159 } else {
160 /* Transaction reserve successful */
161 xfs_ilock(ip, XFS_ILOCK_EXCL);
162 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
163 xfs_trans_ihold(tp, ip);
164 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
165 xfs_trans_set_sync(tp);
166 error = xfs_trans_commit(tp, 0);
167 xfs_iunlock(ip, XFS_ILOCK_EXCL);
168 }
169 }
170
171 return error;
172}
173
174/*
175 * Force a shutdown of the filesystem instantly while keeping 91 * Force a shutdown of the filesystem instantly while keeping
176 * the filesystem consistent. We don't do an unmount here; just shutdown 92 * the filesystem consistent. We don't do an unmount here; just shutdown
177 * the shop, make sure that absolutely nothing persistent happens to 93 * the shop, make sure that absolutely nothing persistent happens to
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h
index f76c003ec55d..f5e4874c37d8 100644
--- a/fs/xfs/xfs_rw.h
+++ b/fs/xfs/xfs_rw.h
@@ -68,7 +68,6 @@ xfs_get_extsz_hint(
68 * Prototypes for functions in xfs_rw.c. 68 * Prototypes for functions in xfs_rw.c.
69 */ 69 */
70extern int xfs_write_clear_setuid(struct xfs_inode *ip); 70extern int xfs_write_clear_setuid(struct xfs_inode *ip);
71extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip);
72extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); 71extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
73extern int xfs_bioerror(struct xfs_buf *bp); 72extern int xfs_bioerror(struct xfs_buf *bp);
74extern int xfs_bioerror_relse(struct xfs_buf *bp); 73extern int xfs_bioerror_relse(struct xfs_buf *bp);
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp,
78extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, 77extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp,
79 xfs_buf_t *bp, xfs_daddr_t blkno); 78 xfs_buf_t *bp, xfs_daddr_t blkno);
80 79
81/*
82 * Prototypes for functions in xfs_vnodeops.c.
83 */
84extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
85 int flags);
86
87#endif /* __XFS_RW_H__ */ 80#endif /* __XFS_RW_H__ */
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 775249a54f6f..ed47fc77759c 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header {
68#define XFS_TRANS_GROWFS 14 68#define XFS_TRANS_GROWFS 14
69#define XFS_TRANS_STRAT_WRITE 15 69#define XFS_TRANS_STRAT_WRITE 15
70#define XFS_TRANS_DIOSTRAT 16 70#define XFS_TRANS_DIOSTRAT 16
71#define XFS_TRANS_WRITE_SYNC 17 71/* 17 was XFS_TRANS_WRITE_SYNC */
72#define XFS_TRANS_WRITEID 18 72#define XFS_TRANS_WRITEID 18
73#define XFS_TRANS_ADDAFORK 19 73#define XFS_TRANS_ADDAFORK 19
74#define XFS_TRANS_ATTRINVAL 20 74#define XFS_TRANS_ATTRINVAL 20
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8ee2f8c8b0a6..218829e6a152 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -307,7 +307,7 @@ xfs_trans_read_buf(
307 return (flags & XFS_BUF_TRYLOCK) ? 307 return (flags & XFS_BUF_TRYLOCK) ?
308 EAGAIN : XFS_ERROR(ENOMEM); 308 EAGAIN : XFS_ERROR(ENOMEM);
309 309
310 if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { 310 if (XFS_BUF_GETERROR(bp) != 0) {
311 xfs_ioerror_alert("xfs_trans_read_buf", mp, 311 xfs_ioerror_alert("xfs_trans_read_buf", mp,
312 bp, blkno); 312 bp, blkno);
313 error = XFS_BUF_GETERROR(bp); 313 error = XFS_BUF_GETERROR(bp);
@@ -315,7 +315,7 @@ xfs_trans_read_buf(
315 return error; 315 return error;
316 } 316 }
317#ifdef DEBUG 317#ifdef DEBUG
318 if (xfs_do_error && (bp != NULL)) { 318 if (xfs_do_error) {
319 if (xfs_error_target == target) { 319 if (xfs_error_target == target) {
320 if (((xfs_req_num++) % xfs_error_mod) == 0) { 320 if (((xfs_req_num++) % xfs_error_mod) == 0) {
321 xfs_buf_relse(bp); 321 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 23d276af2e0c..785ff101da0a 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug(
49 49
50 50
51/* 51/*
52 * Get and lock the inode for the caller if it is not already 52 * Get an inode and join it to the transaction.
53 * locked within the given transaction. If it is already locked
54 * within the transaction, just increment its lock recursion count
55 * and return a pointer to it.
56 *
57 * For an inode to be locked in a transaction, the inode lock, as
58 * opposed to the io lock, must be taken exclusively. This ensures
59 * that the inode can be involved in only 1 transaction at a time.
60 * Lock recursion is handled on the io lock, but only for lock modes
61 * of equal or lesser strength. That is, you can recur on the io lock
62 * held EXCL with a SHARED request but not vice versa. Also, if
63 * the inode is already a part of the transaction then you cannot
64 * go from not holding the io lock to having it EXCL or SHARED.
65 *
66 * Use the inode cache routine xfs_inode_incore() to find the inode
67 * if it is already owned by this transaction.
68 *
69 * If we don't already own the inode, use xfs_iget() to get it.
70 * Since the inode log item structure is embedded in the incore
71 * inode structure and is initialized when the inode is brought
72 * into memory, there is nothing to do with it here.
73 *
74 * If the given transaction pointer is NULL, just call xfs_iget().
75 * This simplifies code which must handle both cases.
76 */ 53 */
77int 54int
78xfs_trans_iget( 55xfs_trans_iget(
@@ -84,62 +61,11 @@ xfs_trans_iget(
84 xfs_inode_t **ipp) 61 xfs_inode_t **ipp)
85{ 62{
86 int error; 63 int error;
87 xfs_inode_t *ip;
88
89 /*
90 * If the transaction pointer is NULL, just call the normal
91 * xfs_iget().
92 */
93 if (tp == NULL)
94 return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0);
95
96 /*
97 * If we find the inode in core with this transaction
98 * pointer in its i_transp field, then we know we already
99 * have it locked. In this case we just increment the lock
100 * recursion count and return the inode to the caller.
101 * Assert that the inode is already locked in the mode requested
102 * by the caller. We cannot do lock promotions yet, so
103 * die if someone gets this wrong.
104 */
105 if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) {
106 /*
107 * Make sure that the inode lock is held EXCL and
108 * that the io lock is never upgraded when the inode
109 * is already a part of the transaction.
110 */
111 ASSERT(ip->i_itemp != NULL);
112 ASSERT(lock_flags & XFS_ILOCK_EXCL);
113 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
114 ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
115 xfs_isilocked(ip, XFS_IOLOCK_EXCL));
116 ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) ||
117 (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL));
118 ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
119 xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED));
120 ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) ||
121 (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY));
122
123 if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) {
124 ip->i_itemp->ili_iolock_recur++;
125 }
126 if (lock_flags & XFS_ILOCK_EXCL) {
127 ip->i_itemp->ili_ilock_recur++;
128 }
129 *ipp = ip;
130 return 0;
131 }
132
133 ASSERT(lock_flags & XFS_ILOCK_EXCL);
134 error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0);
135 if (error) {
136 return error;
137 }
138 ASSERT(ip != NULL);
139 64
140 xfs_trans_ijoin(tp, ip, lock_flags); 65 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0);
141 *ipp = ip; 66 if (!error && tp)
142 return 0; 67 xfs_trans_ijoin(tp, *ipp, lock_flags);
68 return error;
143} 69}
144 70
145/* 71/*
@@ -163,8 +89,6 @@ xfs_trans_ijoin(
163 xfs_inode_item_init(ip, ip->i_mount); 89 xfs_inode_item_init(ip, ip->i_mount);
164 iip = ip->i_itemp; 90 iip = ip->i_itemp;
165 ASSERT(iip->ili_flags == 0); 91 ASSERT(iip->ili_flags == 0);
166 ASSERT(iip->ili_ilock_recur == 0);
167 ASSERT(iip->ili_iolock_recur == 0);
168 92
169 /* 93 /*
170 * Get a log_item_desc to point at the new item. 94 * Get a log_item_desc to point at the new item.
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 492d75bae2bf..a434f287962d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -611,7 +611,7 @@ xfs_fsync(
611 xfs_inode_t *ip) 611 xfs_inode_t *ip)
612{ 612{
613 xfs_trans_t *tp; 613 xfs_trans_t *tp;
614 int error; 614 int error = 0;
615 int log_flushed = 0, changed = 1; 615 int log_flushed = 0, changed = 1;
616 616
617 xfs_itrace_entry(ip); 617 xfs_itrace_entry(ip);
@@ -619,14 +619,9 @@ xfs_fsync(
619 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 619 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
620 return XFS_ERROR(EIO); 620 return XFS_ERROR(EIO);
621 621
622 /* capture size updates in I/O completion before writing the inode. */
623 error = xfs_wait_on_pages(ip, 0, -1);
624 if (error)
625 return XFS_ERROR(error);
626
627 /* 622 /*
628 * We always need to make sure that the required inode state is safe on 623 * We always need to make sure that the required inode state is safe on
629 * disk. The vnode might be clean but we still might need to force the 624 * disk. The inode might be clean but we still might need to force the
630 * log because of committed transactions that haven't hit the disk yet. 625 * log because of committed transactions that haven't hit the disk yet.
631 * Likewise, there could be unflushed non-transactional changes to the 626 * Likewise, there could be unflushed non-transactional changes to the
632 * inode core that have to go to disk and this requires us to issue 627 * inode core that have to go to disk and this requires us to issue
@@ -638,7 +633,7 @@ xfs_fsync(
638 */ 633 */
639 xfs_ilock(ip, XFS_ILOCK_SHARED); 634 xfs_ilock(ip, XFS_ILOCK_SHARED);
640 635
641 if (!(ip->i_update_size || ip->i_update_core)) { 636 if (!ip->i_update_core) {
642 /* 637 /*
643 * Timestamps/size haven't changed since last inode flush or 638 * Timestamps/size haven't changed since last inode flush or
644 * inode transaction commit. That means either nothing got 639 * inode transaction commit. That means either nothing got
@@ -718,7 +713,7 @@ xfs_fsync(
718 * when the link count isn't zero and by xfs_dm_punch_hole() when 713 * when the link count isn't zero and by xfs_dm_punch_hole() when
719 * punching a hole to EOF. 714 * punching a hole to EOF.
720 */ 715 */
721int 716STATIC int
722xfs_free_eofblocks( 717xfs_free_eofblocks(
723 xfs_mount_t *mp, 718 xfs_mount_t *mp,
724 xfs_inode_t *ip, 719 xfs_inode_t *ip,
@@ -1476,8 +1471,8 @@ xfs_create(
1476 if (error == ENOSPC) { 1471 if (error == ENOSPC) {
1477 /* flush outstanding delalloc blocks and retry */ 1472 /* flush outstanding delalloc blocks and retry */
1478 xfs_flush_inodes(dp); 1473 xfs_flush_inodes(dp);
1479 error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, 1474 error = xfs_trans_reserve(tp, resblks, log_res, 0,
1480 XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); 1475 XFS_TRANS_PERM_LOG_RES, log_count);
1481 } 1476 }
1482 if (error == ENOSPC) { 1477 if (error == ENOSPC) {
1483 /* No space at all so try a "no-allocation" reservation */ 1478 /* No space at all so try a "no-allocation" reservation */